diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 20440eb..e6abd2f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -49,9 +49,14 @@ public class DocumentGraphFactory { Document documentGraph = new Document(); Context context = new Context(documentGraph); - document.getPages().forEach(context::buildAndAddPageWithCounter); - document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image)); - addSections(document, context); + document.getPages() + .forEach(context::buildAndAddPageWithCounter); + document.getSections() + .stream() + .flatMap(section -> section.getImages() + .stream()) + .forEach(image -> context.getImages().add(image)); + addSections(document, context, documentGraph); addHeaderAndFooterToEachPage(document, context); documentGraph.setNumberOfPages(context.pages.size()); @@ -62,9 +67,10 @@ public class DocumentGraphFactory { } - private void addSections(ClassificationDocument document, Context context) { + private void addSections(ClassificationDocument classificationDocument, Context context, Document document) { - document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context)); + classificationDocument.getSections() + .forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context, document)); } @@ -74,9 +80,11 @@ public class DocumentGraphFactory { GenericSemanticNode node; if (originalTextBlock.isHeadline()) { - node = Headline.builder().documentTree(context.getDocumentTree()).build(); + node = Headline.builder().documentTree(context.getDocumentTree()) + .build(); } else { - node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); + node = Paragraph.builder().documentTree(context.getDocumentTree()) + .build(); } page.getMainBody().add(node); @@ -93,6 +101,22 @@ public class DocumentGraphFactory { public void addImage(Section section, ClassifiedImage image, Context context) { + Image imageNode = createImage(image, context); + List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); + imageNode.setTreeId(treeId); + } + + + public void addImage(Document document, ClassifiedImage image, Context context) { + + Image imageNode = createImage(image, context); + List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(document, imageNode); + imageNode.setTreeId(treeId); + } + + + private Image createImage(ClassifiedImage image, Context context) { + Rectangle2D position = image.getPosition(); Page page = context.getPage(image.getPage()); Image imageNode = Image.builder() @@ -104,9 +128,7 @@ public class DocumentGraphFactory { .documentTree(context.getDocumentTree()) .build(); page.getMainBody().add(imageNode); - - List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); - imageNode.setTreeId(treeId); + return imageNode; } @@ -145,11 +167,12 @@ public class DocumentGraphFactory { private void addFooter(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), - footer, - context, - page); + footer, + context, + page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -160,7 +183,8 @@ public class DocumentGraphFactory { public void addHeader(List textBlocks, Context context) { Page page = context.getPage(textBlocks.get(0).getPage()); - Header header = Header.builder().documentTree(context.getDocumentTree()).build(); + Header header = Header.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); @@ -172,7 +196,8 @@ public class DocumentGraphFactory { private void addEmptyFooter(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); @@ -184,7 +209,8 @@ public class DocumentGraphFactory { private void addEmptyHeader(int pageIndex, Context context) { Page page = context.getPage(pageIndex); - Header header = Header.builder().documentTree(context.getDocumentTree()).build(); + Header header = Header.builder().documentTree(context.getDocumentTree()) + .build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 7bd82e2..d1d5275 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -11,6 +11,7 @@ import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -24,27 +25,46 @@ import lombok.experimental.UtilityClass; @UtilityClass public class SectionNodeFactory { - public void addSection(GenericSemanticNode parentNode, List pageBlocks, List images, DocumentGraphFactory.Context context) { + public void addSection(GenericSemanticNode parentNode, + List pageBlocks, + List images, + DocumentGraphFactory.Context context, + Document document) { + + // This is for the case where we have images on a page without any text/footer/header. + // The pageBlocks list is empty, but we still need to add those images to the document. + if (!images.isEmpty() && pageBlocks.isEmpty()) { + images.stream() + .distinct() + .forEach(image -> DocumentGraphFactory.addImage(document, image, context)); + return; + } if (pageBlocks.isEmpty()) { return; } - Map> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().documentTree(context.getDocumentTree()).build(); + + Map> blocksPerPage = pageBlocks.stream() + .collect(groupingBy(AbstractPageBlock::getPage)); + Section section = Section.builder().documentTree(context.getDocumentTree()) + .build(); context.getSections().add(section); - blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber)); + blocksPerPage.keySet() + .forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber)); section.setTreeId(getTreeId(parentNode, context, section)); - addFirstHeadlineDirectlyToSection(pageBlocks, context, section); + addFirstHeadlineDirectlyToSection(pageBlocks, context, section, document); if (containsTablesAndTextBlocks(pageBlocks)) { - splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context)); + splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context, document)); } else { - addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section); + addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section, document); } - images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context)); + images.stream() + .distinct() + .forEach(image -> DocumentGraphFactory.addImage(section, image, context)); } @@ -58,16 +78,16 @@ public class SectionNodeFactory { } - private void addFirstHeadlineDirectlyToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section) { + private void addFirstHeadlineDirectlyToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) { if (pageBlocks.get(0).isHeadline()) { - addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section); + addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section, document); pageBlocks.remove(0); } } - private void addTablesAndParagraphsAndHeadlinesToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section) { + private void addTablesAndParagraphsAndHeadlinesToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) { Set alreadyMerged = new HashSet<>(); List remainingBlocks = new LinkedList<>(pageBlocks); @@ -86,7 +106,7 @@ public class SectionNodeFactory { } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); alreadyMerged.addAll(tablesToMerge); - TableNodeFactory.addTable(section, tablesToMerge, context); + TableNodeFactory.addTable(section, tablesToMerge, context, document); } else { throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass())); } @@ -96,7 +116,9 @@ public class SectionNodeFactory { private boolean containsTablesAndTextBlocks(List pageBlocks) { - return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock); + return pageBlocks.stream() + .anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream() + .anyMatch(pageBlock -> pageBlock instanceof TextPageBlock); } @@ -112,7 +134,9 @@ public class SectionNodeFactory { List> splitList = splitIntoCoherentList(pageBlocks); movePrecedingHeadlineToTableList(splitList); - return splitList.stream().filter(list -> !list.isEmpty()).toList(); + return splitList.stream() + .filter(list -> !list.isEmpty()) + .toList(); } @@ -133,7 +157,8 @@ public class SectionNodeFactory { private boolean listIsTablesOnly(List abstractPageBlocks) { - return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock); + return abstractPageBlocks.stream() + .allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index c00edd1..3ba9559 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -8,6 +8,7 @@ import java.util.Set; import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -27,23 +28,26 @@ public class TableNodeFactory { public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; - public void addTable(GenericSemanticNode parentNode, List tablesToMerge, DocumentGraphFactory.Context context) { + public void addTable(GenericSemanticNode parentNode, List tablesToMerge, DocumentGraphFactory.Context context, Document document) { setPageNumberInCells(tablesToMerge); - Set pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet()); - List> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList(); + Set pages = tablesToMerge.stream() + .map(AbstractPageBlock::getPage) + .map(context::getPage) + .collect(Collectors.toSet()); + List> mergedRows = tablesToMerge.stream() + .map(TablePageBlock::getRows) + .flatMap(Collection::stream) + .toList(); - Table table = Table.builder() - .documentTree(context.getDocumentTree()) - .numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()) - .numberOfRows(mergedRows.size()) + Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size()) .build(); pages.forEach(page -> addTableToPage(page, parentNode, table)); List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); table.setTreeId(treeId); - addTableCells(mergedRows, table, context); + addTableCells(mergedRows, table, context, document); ifTableHasNoHeadersSetFirstRowAsHeaders(table); } @@ -63,7 +67,8 @@ public class TableNodeFactory { private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) { - cell.getTextBlocks().stream()// + cell.getTextBlocks() + .stream()// .filter(tb -> tb.getPage() == 0)// .forEach(tb -> tb.setPage(table.getPage())); } @@ -82,28 +87,32 @@ public class TableNodeFactory { private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) { - if (table.streamHeaders().findAny().isEmpty()) { - table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true)); + if (table.streamHeaders() + .findAny().isEmpty()) { + table.streamRow(0) + .forEach(tableCellNode -> tableCellNode.setHeader(true)); } } - private void addTableCells(List> rows, Table table, DocumentGraphFactory.Context context) { + private void addTableCells(List> rows, Table table, DocumentGraphFactory.Context context, Document document) { for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { - addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context); + addTableCell(rows.get(rowIndex) + .get(colIndex), rowIndex, colIndex, table, context, document); } } } @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong - private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) { + private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context, Document document) { Page page = context.getPage(cell.getPageNumber()); - TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build(); + TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()) + .build(); page.getMainBody().add(tableCell); List treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell); @@ -113,16 +122,26 @@ public class TableNodeFactory { if (cell.getTextBlocks().isEmpty()) { tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); } else if (cell.getTextBlocks().size() == 1) { - textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); + textBlock = context.getTextBlockFactory() + .buildAtomicTextBlock(cell.getTextBlocks() + .get(0).getSequences(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else if (firstTextBlockIsHeadline(cell)) { - SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context); + SectionNodeFactory.addSection(tableCell, + cell.getTextBlocks() + .stream() + .map(tb -> (AbstractPageBlock) tb) + .toList(), + emptyList(), + context, + document); } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { List sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else { - cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); + cell.getTextBlocks() + .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); } } @@ -135,7 +154,8 @@ public class TableNodeFactory { private boolean firstTextBlockIsHeadline(Cell cell) { - return cell.getTextBlocks().get(0).isHeadline(); + return cell.getTextBlocks() + .get(0).isHeadline(); } }