From 456b8fe4a139e7d6cff2ac450c627bcbee8a2e76 Mon Sep 17 00:00:00 2001 From: Andrei Isvoran Date: Wed, 3 Apr 2024 10:20:46 +0300 Subject: [PATCH] RED-8773 - Fix images not appearing on specific file --- .../factory/DocumentGraphFactory.java | 77 ++++++++++++------- .../services/factory/SectionNodeFactory.java | 62 +++++++++++---- .../services/factory/TableNodeFactory.java | 75 +++++++++++++----- 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index f402c8b..d0cee88 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -54,9 +54,14 @@ public class DocumentGraphFactory { Document documentGraph = new Document(); Context context = new Context(documentGraph); - document.getPages().forEach(context::buildAndAddPageWithCounter); - document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image)); - addSections(layoutParsingType, document, context); + document.getPages() + .forEach(context::buildAndAddPageWithCounter); + document.getSections() + .stream() + .flatMap(section -> section.getImages() + .stream()) + .forEach(image -> context.getImages().add(image)); + addSections(layoutParsingType, document, context, documentGraph); addHeaderAndFooterToEachPage(document, context); documentGraph.setNumberOfPages(context.pages.size()); @@ -67,9 +72,10 @@ public class DocumentGraphFactory { } - private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) { + private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) { - document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context)); + classificationDocument.getSections() + .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document)); } @@ -79,11 +85,14 @@ public class DocumentGraphFactory { GenericSemanticNode node; if (originalTextBlock.isHeadline()) { - node = Headline.builder().documentTree(context.getDocumentTree()).build(); + node = Headline.builder().documentTree(context.getDocumentTree()) + .build(); } else if (originalTextBlock.isToDuplicate()) { - node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); + node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()) + .build(); } else { - node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); + node = Paragraph.builder().documentTree(context.getDocumentTree()) + .build(); } page.getMainBody().add(node); @@ -96,8 +105,9 @@ public class DocumentGraphFactory { if (node instanceof DuplicatedParagraph duplicatedParagraph) { AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() - .flatMap(tb -> tb.getSequences().stream()) - .collect(Collectors.toList()), node, context, page); + .flatMap(tb -> tb.getSequences() + .stream()) + .collect(Collectors.toList()), node, context, page); duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); } @@ -109,23 +119,34 @@ public class DocumentGraphFactory { public void addImage(Section section, ClassifiedImage image, Context context) { + Image imageNode = createImage(image, context); + List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); + imageNode.setTreeId(treeId); + } + + + public void addImage(Document document, ClassifiedImage image, Context context) { + + Image imageNode = createImage(image, context); + List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(document, imageNode); + imageNode.setTreeId(treeId); + } + + + private Image createImage(ClassifiedImage image, Context context) { + Rectangle2D position = image.getPosition(); Page page = context.getPage(image.getPage()); - var imageBuilder = Image.builder() + Image imageNode = Image.builder() .id(IdBuilder.buildId(Set.of(page), List.of(position))) .imageType(image.getImageType()) .position(position) .transparent(image.isHasTransparency()) .page(page) - .documentTree(context.getDocumentTree()); - if (image.isSourceByAi()) { - imageBuilder.engines(new HashSet<>(Set.of(LayoutEngine.AI))); - } - Image imageNode = imageBuilder.build(); + .documentTree(context.getDocumentTree()) + .build(); page.getMainBody().add(imageNode); - - List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); - imageNode.setTreeId(treeId); + return imageNode; } @@ -164,11 +185,12 @@ public class DocumentGraphFactory { private void addFooter(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), - footer, - context, - page); + footer, + context, + page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -179,7 +201,8 @@ public class DocumentGraphFactory { public void addHeader(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Header header = Header.builder().documentTree(context.getDocumentTree()).build(); + Header header = Header.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); @@ -191,7 +214,8 @@ public class DocumentGraphFactory { private void addEmptyFooter(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); @@ -203,7 +227,8 @@ public class DocumentGraphFactory { private void addEmptyHeader(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Header header = Header.builder().documentTree(context.getDocumentTree()).build(); + Header header = Header.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 31c723d..f4b26eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -13,6 +13,7 @@ import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; @@ -30,27 +31,48 @@ public class SectionNodeFactory { GenericSemanticNode parentNode, List pageBlocks, List images, - DocumentGraphFactory.Context context) { + DocumentGraphFactory.Context context, + Document document) { + + // This is for the case where we have images on a page without any text/footer/header. + // The pageBlocks list is empty, but we still need to add those images to the document. + if (!images.isEmpty() && pageBlocks.isEmpty()) { + images.stream() + .distinct() + .forEach(image -> DocumentGraphFactory.addImage(document, image, context)); + return; + } if (pageBlocks.isEmpty()) { return; } - Map> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().documentTree(context.getDocumentTree()).build(); + + Map> blocksPerPage = pageBlocks.stream() + .collect(groupingBy(AbstractPageBlock::getPage)); + Section section = Section.builder().documentTree(context.getDocumentTree()) + .build(); context.getSections().add(section); - blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber)); + blocksPerPage.keySet() + .forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber)); section.setTreeId(getTreeId(parentNode, context, section)); - addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section); + addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); if (containsTablesAndTextBlocks(pageBlocks)) { - splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context)); + splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, + section, + subSectionPageBlocks, + emptyList(), + context, + document)); } else { - addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section); + addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); } - images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context)); + images.stream() + .distinct() + .forEach(image -> DocumentGraphFactory.addImage(section, image, context)); } @@ -64,10 +86,14 @@ public class SectionNodeFactory { } - private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List pageBlocks, DocumentGraphFactory.Context context, Section section) { + private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, + List pageBlocks, + DocumentGraphFactory.Context context, + Section section, + Document document) { if (pageBlocks.get(0).isHeadline()) { - addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section); + addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section, document); pageBlocks.remove(0); } } @@ -76,7 +102,8 @@ public class SectionNodeFactory { private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType, List pageBlocks, DocumentGraphFactory.Context context, - Section section) { + Section section, + Document document) { Set alreadyMerged = new HashSet<>(); List remainingBlocks = new LinkedList<>(pageBlocks); @@ -105,7 +132,7 @@ public class SectionNodeFactory { } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); alreadyMerged.addAll(tablesToMerge); - TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context); + TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document); } else { throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass())); } @@ -115,7 +142,9 @@ public class SectionNodeFactory { private boolean containsTablesAndTextBlocks(List pageBlocks) { - return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock); + return pageBlocks.stream() + .anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream() + .anyMatch(pageBlock -> pageBlock instanceof TextPageBlock); } @@ -131,7 +160,9 @@ public class SectionNodeFactory { List> splitList = splitIntoCoherentList(pageBlocks); movePrecedingHeadlineToTableList(splitList); - return splitList.stream().filter(list -> !list.isEmpty()).toList(); + return splitList.stream() + .filter(list -> !list.isEmpty()) + .toList(); } @@ -152,7 +183,8 @@ public class SectionNodeFactory { private boolean listIsTablesOnly(List abstractPageBlocks) { - return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock); + return abstractPageBlocks.stream() + .allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index 21d05fd..f71669c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -9,6 +9,7 @@ import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; @@ -28,23 +29,30 @@ public class TableNodeFactory { public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; - public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List tablesToMerge, DocumentGraphFactory.Context context) { + public void addTable(LayoutParsingType layoutParsingType, + GenericSemanticNode parentNode, + List tablesToMerge, + DocumentGraphFactory.Context context, + Document document) { setPageNumberInCells(tablesToMerge); - Set pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet()); - List> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList(); + Set pages = tablesToMerge.stream() + .map(AbstractPageBlock::getPage) + .map(context::getPage) + .collect(Collectors.toSet()); + List> mergedRows = tablesToMerge.stream() + .map(TablePageBlock::getRows) + .flatMap(Collection::stream) + .toList(); - Table table = Table.builder() - .documentTree(context.getDocumentTree()) - .numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()) - .numberOfRows(mergedRows.size()) + Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size()) .build(); pages.forEach(page -> addTableToPage(page, parentNode, table)); List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); table.setTreeId(treeId); - addTableCells(layoutParsingType, mergedRows, table, context); + addTableCells(layoutParsingType, mergedRows, table, context, document); ifTableHasNoHeadersSetFirstRowAsHeaders(table); } @@ -64,7 +72,8 @@ public class TableNodeFactory { private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) { - cell.getTextBlocks().stream()// + cell.getTextBlocks() + .stream()// .filter(tb -> tb.getPage() == 0)// .forEach(tb -> tb.setPage(table.getPage())); } @@ -83,28 +92,44 @@ public class TableNodeFactory { private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) { - if (table.streamHeaders().findAny().isEmpty()) { - table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true)); + if (table.streamHeaders() + .findAny().isEmpty()) { + table.streamRow(0) + .forEach(tableCellNode -> tableCellNode.setHeader(true)); } } - private void addTableCells(LayoutParsingType layoutParsingType, List> rows, Table table, DocumentGraphFactory.Context context) { + private void addTableCells(LayoutParsingType layoutParsingType, List> rows, Table table, DocumentGraphFactory.Context context, Document document) { for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { - addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context); + addTableCell(layoutParsingType, + rows.get(rowIndex) + .get(colIndex), + rowIndex, + colIndex, + table, + context, + document); } } } @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong - private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) { + private void addTableCell(LayoutParsingType layoutParsingType, + Cell cell, + int rowIndex, + int colIndex, + Table tableNode, + DocumentGraphFactory.Context context, + Document document) { Page page = context.getPage(cell.getPageNumber()); - TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build(); + TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()) + .build(); page.getMainBody().add(tableCell); List treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell); @@ -114,16 +139,27 @@ public class TableNodeFactory { if (cell.getTextBlocks().isEmpty()) { tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); } else if (cell.getTextBlocks().size() == 1) { - textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); + textBlock = context.getTextBlockFactory() + .buildAtomicTextBlock(cell.getTextBlocks() + .get(0).getSequences(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else if (firstTextBlockIsHeadline(cell)) { - SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context); + SectionNodeFactory.addSection(layoutParsingType, + tableCell, + cell.getTextBlocks() + .stream() + .map(tb -> (AbstractPageBlock) tb) + .toList(), + emptyList(), + context, + document); } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { List sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else { - cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); + cell.getTextBlocks() + .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); } } @@ -136,7 +172,8 @@ public class TableNodeFactory { private boolean firstTextBlockIsHeadline(Cell cell) { - return cell.getTextBlocks().get(0).isHeadline(); + return cell.getTextBlocks() + .get(0).isHeadline(); } }