RED-8773 - Fix images not appearing on specific file

This commit is contained in:
Andrei Isvoran 2024-04-03 10:20:46 +03:00
parent 8bd0de6263
commit 456b8fe4a1
3 changed files with 154 additions and 60 deletions

View File

@ -54,9 +54,14 @@ public class DocumentGraphFactory {
Document documentGraph = new Document();
Context context = new Context(documentGraph);
document.getPages().forEach(context::buildAndAddPageWithCounter);
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
addSections(layoutParsingType, document, context);
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
document.getSections()
.stream()
.flatMap(section -> section.getImages()
.stream())
.forEach(image -> context.getImages().add(image));
addSections(layoutParsingType, document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
@ -67,9 +72,10 @@ public class DocumentGraphFactory {
}
/**
 * Walks the sections of the classification document and hands each one to
 * SectionNodeFactory.addSection, passing null as the parent node together with the section's
 * non-empty page blocks and its images.
 *
 * NOTE(review): this listing shows two signatures and two bodies back to back without diff
 * markers. The second pair names the classification input `classificationDocument` and forwards
 * an additional graph Document to SectionNodeFactory.addSection; confirm which pair is live.
 */
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context));
classificationDocument.getSections()
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
}
@ -79,11 +85,14 @@ public class DocumentGraphFactory {
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
node = Headline.builder().documentTree(context.getDocumentTree())
.build();
} else if (originalTextBlock.isToDuplicate()) {
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree())
.build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
node = Paragraph.builder().documentTree(context.getDocumentTree())
.build();
}
page.getMainBody().add(node);
@ -96,8 +105,9 @@ public class DocumentGraphFactory {
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
.flatMap(tb -> tb.getSequences().stream())
.collect(Collectors.toList()), node, context, page);
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toList()), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
}
@ -109,23 +119,34 @@ public class DocumentGraphFactory {
/**
 * Creates the image node for a classified image via createImage, requests a new child entry for
 * it from the document tree with the given section as the first argument, and stores the
 * returned tree id on the node.
 *
 * @param section section node passed to the document tree alongside the new image node
 * @param image   classified image to turn into an image node
 * @param context graph-building context that supplies the document tree
 */
public void addImage(Section section, ClassifiedImage image, Context context) {
    Image sectionImage = createImage(image, context);
    sectionImage.setTreeId(context.getDocumentTree().createNewChildEntryAndReturnId(section, sectionImage));
}
/**
 * Creates the image node for a classified image via createImage, requests a new child entry for
 * it from the document tree with the document node as the first argument, and stores the
 * returned tree id on the node.
 *
 * @param document document node passed to the document tree alongside the new image node
 * @param image    classified image to turn into an image node
 * @param context  graph-building context that supplies the document tree
 */
public void addImage(Document document, ClassifiedImage image, Context context) {
    Image documentImage = createImage(image, context);
    documentImage.setTreeId(context.getDocumentTree().createNewChildEntryAndReturnId(document, documentImage));
}
/**
 * Builds an Image node from a classified image: looks up the page through the context, derives
 * the node id from the page and the image position, copies image type, position and transparency
 * flag, adds the node to the page's main body, and returns it.
 *
 * NOTE(review): this listing shows two builder variants back to back without diff markers
 * (a `var imageBuilder` chain finished by `imageBuilder.build()` and an `Image imageNode` chain
 * finished by `.build()` directly); confirm which one is live.
 */
private Image createImage(ClassifiedImage image, Context context) {
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
var imageBuilder = Image.builder()
Image imageNode = Image.builder()
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
.imageType(image.getImageType())
.position(position)
.transparent(image.isHasTransparency())
.page(page)
.documentTree(context.getDocumentTree());
// NOTE(review): only the `imageBuilder` variant tags AI-sourced images with LayoutEngine.AI.
// The variant that ends in `.build()` directly has no equivalent call, so the engine tagging
// would be lost if that variant is the live one. Confirm this is intended.
if (image.isSourceByAi()) {
imageBuilder.engines(new HashSet<>(Set.of(LayoutEngine.AI)));
}
Image imageNode = imageBuilder.build();
.documentTree(context.getDocumentTree())
.build();
page.getMainBody().add(imageNode);
// NOTE(review): `section` is not a parameter of createImage; the two tree-id lines below look
// like leftovers of the former addImage(Section, ...) body, now handled by the addImage overloads.
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
imageNode.setTreeId(treeId);
return imageNode;
}
@ -164,11 +185,12 @@ public class DocumentGraphFactory {
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
footer,
context,
page);
footer,
context,
page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -179,7 +201,8 @@ public class DocumentGraphFactory {
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
@ -191,7 +214,8 @@ public class DocumentGraphFactory {
private void addEmptyFooter(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
@ -203,7 +227,8 @@ public class DocumentGraphFactory {
private void addEmptyHeader(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);

View File

@ -13,6 +13,7 @@ import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
@ -30,27 +31,48 @@ public class SectionNodeFactory {
GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context) {
DocumentGraphFactory.Context context,
Document document) {
// This is for the case where we have images on a page without any text/footer/header.
// The pageBlocks list is empty, but we still need to add those images to the document.
if (!images.isEmpty() && pageBlocks.isEmpty()) {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
return;
}
if (pageBlocks.isEmpty()) {
return;
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree())
.build();
context.getSections().add(section);
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
blocksPerPage.keySet()
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context));
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section,
subSectionPageBlocks,
emptyList(),
context,
document));
} else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section);
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
}
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
}
@ -64,10 +86,14 @@ public class SectionNodeFactory {
}
/**
 * If the first page block is a headline, adds that single block to the section through
 * addTablesAndParagraphsAndHeadlinesToSection and then removes it from pageBlocks.
 *
 * The list is read with get(0) unconditionally and modified with remove(0), so it must be
 * non-empty and must support removal; the caller's list is mutated.
 *
 * NOTE(review): two signatures and two delegate calls appear back to back without diff markers;
 * the second form additionally carries a Document argument.
 */
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section,
Document document) {
if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section);
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section, document);
pageBlocks.remove(0);
}
}
@ -76,7 +102,8 @@ public class SectionNodeFactory {
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
Section section) {
Section section,
Document document) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
@ -105,7 +132,7 @@ public class SectionNodeFactory {
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context);
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
} else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
}
@ -115,7 +142,9 @@ public class SectionNodeFactory {
/**
 * Returns true when the list holds at least one TablePageBlock and at least one TextPageBlock.
 *
 * NOTE(review): the single-line return and the wrapped return below express the same condition;
 * they appear together because this listing carries no diff markers.
 */
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
return pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
}
@ -131,7 +160,9 @@ public class SectionNodeFactory {
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
movePrecedingHeadlineToTableList(splitList);
return splitList.stream().filter(list -> !list.isEmpty()).toList();
return splitList.stream()
.filter(list -> !list.isEmpty())
.toList();
}
@ -152,7 +183,8 @@ public class SectionNodeFactory {
/**
 * Returns true when every element of the list is a TablePageBlock. Because allMatch is used,
 * an empty list also yields true.
 *
 * NOTE(review): the single-line return and the wrapped return below are the same expression;
 * they appear together because this listing carries no diff markers.
 */
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
return abstractPageBlocks.stream()
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
}

View File

@ -9,6 +9,7 @@ import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
@ -28,23 +29,30 @@ public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
/**
 * Builds one Table node from a list of table page blocks that are to be merged.
 *
 * Steps, in order: set page numbers in the cells; collect the set of pages the blocks lie on;
 * concatenate the rows of all blocks; build the Table with the column count taken from the first
 * merged row (0 when there are no rows) and the row count from the merged rows; register the
 * table on every collected page; request a child entry under parentNode from the document tree
 * and store the returned tree id; add the table cells; finally mark the first row as headers if
 * the table has none.
 *
 * NOTE(review): old and new formatting of the signature, the stream chains and the builder call
 * appear back to back without diff markers; the second signature and the second addTableCells
 * call additionally carry a Document argument.
 */
public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
public void addTable(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
List<TablePageBlock> tablesToMerge,
DocumentGraphFactory.Context context,
Document document) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
Set<Page> pages = tablesToMerge.stream()
.map(AbstractPageBlock::getPage)
.map(context::getPage)
.collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream()
.map(TablePageBlock::getRows)
.flatMap(Collection::stream)
.toList();
Table table = Table.builder()
.documentTree(context.getDocumentTree())
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
.numberOfRows(mergedRows.size())
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(layoutParsingType, mergedRows, table, context);
addTableCells(layoutParsingType, mergedRows, table, context, document);
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
@ -64,7 +72,8 @@ public class TableNodeFactory {
/**
 * For every text block of the cell whose page is 0, sets the page to the page of the given table.
 * Text blocks with any other page value are left untouched.
 *
 * NOTE(review): the chain head appears twice (single-line and wrapped) because this listing
 * carries no diff markers.
 */
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
cell.getTextBlocks().stream()//
cell.getTextBlocks()
.stream()//
.filter(tb -> tb.getPage() == 0)//
.forEach(tb -> tb.setPage(table.getPage()));
}
@ -83,28 +92,44 @@ public class TableNodeFactory {
/**
 * When the table's header stream is empty, flags every cell of row 0 as a header cell.
 * Tables that already have at least one header cell are left unchanged.
 *
 * NOTE(review): the condition and the row-0 call each appear twice (single-line and wrapped)
 * because this listing carries no diff markers.
 */
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders().findAny().isEmpty()) {
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
if (table.streamHeaders()
.findAny().isEmpty()) {
table.streamRow(0)
.forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}
/**
 * Iterates over all rows and, within each row, over all cells by index, and calls addTableCell
 * for each cell with its row index and column index. Rows may have different lengths; the inner
 * bound is taken from each row's own size.
 *
 * NOTE(review): two signatures and two addTableCell call forms appear back to back without diff
 * markers; the second form additionally carries a Document argument.
 */
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
addTableCell(layoutParsingType,
rows.get(rowIndex)
.get(colIndex),
rowIndex,
colIndex,
table,
context,
document);
}
}
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
private void addTableCell(LayoutParsingType layoutParsingType,
Cell cell,
int rowIndex,
int colIndex,
Table tableNode,
DocumentGraphFactory.Context context,
Document document) {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
.build();
page.getMainBody().add(tableCell);
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
@ -114,16 +139,27 @@ public class TableNodeFactory {
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.toList(),
emptyList(),
context,
document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
}
}
@ -136,7 +172,8 @@ public class TableNodeFactory {
/**
 * Returns whether the first text block of the cell is a headline. The first element is read with
 * get(0) unconditionally, so the cell's text block list must be non-empty.
 *
 * NOTE(review): the single-line return and the wrapped return below are the same expression;
 * they appear together because this listing carries no diff markers.
 */
private boolean firstTextBlockIsHeadline(Cell cell) {
return cell.getTextBlocks().get(0).isHeadline();
return cell.getTextBlocks()
.get(0).isHeadline();
}
}