RED-8773 - Fix images not appearing on specific file
This commit is contained in:
parent
1ca02f72c8
commit
34b260bb60
@ -49,9 +49,14 @@ public class DocumentGraphFactory {
|
||||
Document documentGraph = new Document();
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context);
|
||||
document.getPages()
|
||||
.forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getImages()
|
||||
.stream())
|
||||
.forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context, documentGraph);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
documentGraph.setNumberOfPages(context.pages.size());
|
||||
@ -62,9 +67,10 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addSections(ClassificationDocument document, Context context) {
|
||||
private void addSections(ClassificationDocument classificationDocument, Context context, Document document) {
|
||||
|
||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
|
||||
classificationDocument.getSections()
|
||||
.forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||
}
|
||||
|
||||
|
||||
@ -74,9 +80,11 @@ public class DocumentGraphFactory {
|
||||
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
node = Headline.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
@ -93,6 +101,22 @@ public class DocumentGraphFactory {
|
||||
|
||||
public void addImage(Section section, ClassifiedImage image, Context context) {
|
||||
|
||||
Image imageNode = createImage(image, context);
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||
imageNode.setTreeId(treeId);
|
||||
}
|
||||
|
||||
|
||||
public void addImage(Document document, ClassifiedImage image, Context context) {
|
||||
|
||||
Image imageNode = createImage(image, context);
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(document, imageNode);
|
||||
imageNode.setTreeId(treeId);
|
||||
}
|
||||
|
||||
|
||||
private Image createImage(ClassifiedImage image, Context context) {
|
||||
|
||||
Rectangle2D position = image.getPosition();
|
||||
Page page = context.getPage(image.getPage());
|
||||
Image imageNode = Image.builder()
|
||||
@ -104,9 +128,7 @@ public class DocumentGraphFactory {
|
||||
.documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
page.getMainBody().add(imageNode);
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||
imageNode.setTreeId(treeId);
|
||||
return imageNode;
|
||||
}
|
||||
|
||||
|
||||
@ -145,11 +167,12 @@ public class DocumentGraphFactory {
|
||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
@ -160,7 +183,8 @@ public class DocumentGraphFactory {
|
||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
@ -172,7 +196,8 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
@ -184,7 +209,8 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyHeader(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
|
||||
@ -11,6 +11,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -24,27 +25,46 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
||||
public void addSection(GenericSemanticNode parentNode,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
Document document) {
|
||||
|
||||
// This is for the case where we have images on a page without any text/footer/header.
|
||||
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||
if (!images.isEmpty() && pageBlocks.isEmpty()) {
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||
return;
|
||||
}
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
blocksPerPage.keySet()
|
||||
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section, document);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context, document));
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section, document);
|
||||
}
|
||||
|
||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
}
|
||||
|
||||
|
||||
@ -58,16 +78,16 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section, document);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
@ -86,7 +106,7 @@ public class SectionNodeFactory {
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context, document);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
}
|
||||
@ -96,7 +116,9 @@ public class SectionNodeFactory {
|
||||
|
||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||
return pageBlocks.stream()
|
||||
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
|
||||
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||
}
|
||||
|
||||
|
||||
@ -112,7 +134,9 @@ public class SectionNodeFactory {
|
||||
|
||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||
movePrecedingHeadlineToTableList(splitList);
|
||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
||||
return splitList.stream()
|
||||
.filter(list -> !list.isEmpty())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -133,7 +157,8 @@ public class SectionNodeFactory {
|
||||
|
||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||
|
||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||
return abstractPageBlocks.stream()
|
||||
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
@ -27,23 +28,26 @@ public class TableNodeFactory {
|
||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||
|
||||
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
setPageNumberInCells(tablesToMerge);
|
||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
||||
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
||||
Set<Page> pages = tablesToMerge.stream()
|
||||
.map(AbstractPageBlock::getPage)
|
||||
.map(context::getPage)
|
||||
.collect(Collectors.toSet());
|
||||
List<List<Cell>> mergedRows = tablesToMerge.stream()
|
||||
.map(TablePageBlock::getRows)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
|
||||
Table table = Table.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
||||
.numberOfRows(mergedRows.size())
|
||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
||||
.build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(mergedRows, table, context);
|
||||
addTableCells(mergedRows, table, context, document);
|
||||
|
||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||
}
|
||||
@ -63,7 +67,8 @@ public class TableNodeFactory {
|
||||
|
||||
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||
|
||||
cell.getTextBlocks().stream()//
|
||||
cell.getTextBlocks()
|
||||
.stream()//
|
||||
.filter(tb -> tb.getPage() == 0)//
|
||||
.forEach(tb -> tb.setPage(table.getPage()));
|
||||
}
|
||||
@ -82,28 +87,32 @@ public class TableNodeFactory {
|
||||
|
||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||
|
||||
if (table.streamHeaders().findAny().isEmpty()) {
|
||||
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
if (table.streamHeaders()
|
||||
.findAny().isEmpty()) {
|
||||
table.streamRow(0)
|
||||
.forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||
addTableCell(rows.get(rowIndex)
|
||||
.get(colIndex), rowIndex, colIndex, table, context, document);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
||||
.build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||
@ -113,16 +122,26 @@ public class TableNodeFactory {
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||
textBlock = context.getTextBlockFactory()
|
||||
.buildAtomicTextBlock(cell.getTextBlocks()
|
||||
.get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||
SectionNodeFactory.addSection(tableCell,
|
||||
cell.getTextBlocks()
|
||||
.stream()
|
||||
.map(tb -> (AbstractPageBlock) tb)
|
||||
.toList(),
|
||||
emptyList(),
|
||||
context,
|
||||
document);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
cell.getTextBlocks()
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,7 +154,8 @@ public class TableNodeFactory {
|
||||
|
||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||
|
||||
return cell.getTextBlocks().get(0).isHeadline();
|
||||
return cell.getTextBlocks()
|
||||
.get(0).isHeadline();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user