RED-8773 - Fix images not appearing on specific file
This commit is contained in:
parent
8bd0de6263
commit
456b8fe4a1
@ -54,9 +54,14 @@ public class DocumentGraphFactory {
|
|||||||
Document documentGraph = new Document();
|
Document documentGraph = new Document();
|
||||||
Context context = new Context(documentGraph);
|
Context context = new Context(documentGraph);
|
||||||
|
|
||||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
document.getPages()
|
||||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
.forEach(context::buildAndAddPageWithCounter);
|
||||||
addSections(layoutParsingType, document, context);
|
document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(section -> section.getImages()
|
||||||
|
.stream())
|
||||||
|
.forEach(image -> context.getImages().add(image));
|
||||||
|
addSections(layoutParsingType, document, context, documentGraph);
|
||||||
addHeaderAndFooterToEachPage(document, context);
|
addHeaderAndFooterToEachPage(document, context);
|
||||||
|
|
||||||
documentGraph.setNumberOfPages(context.pages.size());
|
documentGraph.setNumberOfPages(context.pages.size());
|
||||||
@ -67,9 +72,10 @@ public class DocumentGraphFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) {
|
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||||
|
|
||||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context));
|
classificationDocument.getSections()
|
||||||
|
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -79,11 +85,14 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
GenericSemanticNode node;
|
GenericSemanticNode node;
|
||||||
if (originalTextBlock.isHeadline()) {
|
if (originalTextBlock.isHeadline()) {
|
||||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
node = Headline.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
} else if (originalTextBlock.isToDuplicate()) {
|
} else if (originalTextBlock.isToDuplicate()) {
|
||||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
} else {
|
} else {
|
||||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
node = Paragraph.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
page.getMainBody().add(node);
|
page.getMainBody().add(node);
|
||||||
@ -96,8 +105,9 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
||||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
|
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
|
||||||
.flatMap(tb -> tb.getSequences().stream())
|
.flatMap(tb -> tb.getSequences()
|
||||||
.collect(Collectors.toList()), node, context, page);
|
.stream())
|
||||||
|
.collect(Collectors.toList()), node, context, page);
|
||||||
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
|
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,23 +119,34 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
public void addImage(Section section, ClassifiedImage image, Context context) {
|
public void addImage(Section section, ClassifiedImage image, Context context) {
|
||||||
|
|
||||||
|
Image imageNode = createImage(image, context);
|
||||||
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||||
|
imageNode.setTreeId(treeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addImage(Document document, ClassifiedImage image, Context context) {
|
||||||
|
|
||||||
|
Image imageNode = createImage(image, context);
|
||||||
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(document, imageNode);
|
||||||
|
imageNode.setTreeId(treeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Image createImage(ClassifiedImage image, Context context) {
|
||||||
|
|
||||||
Rectangle2D position = image.getPosition();
|
Rectangle2D position = image.getPosition();
|
||||||
Page page = context.getPage(image.getPage());
|
Page page = context.getPage(image.getPage());
|
||||||
var imageBuilder = Image.builder()
|
Image imageNode = Image.builder()
|
||||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||||
.imageType(image.getImageType())
|
.imageType(image.getImageType())
|
||||||
.position(position)
|
.position(position)
|
||||||
.transparent(image.isHasTransparency())
|
.transparent(image.isHasTransparency())
|
||||||
.page(page)
|
.page(page)
|
||||||
.documentTree(context.getDocumentTree());
|
.documentTree(context.getDocumentTree())
|
||||||
if (image.isSourceByAi()) {
|
.build();
|
||||||
imageBuilder.engines(new HashSet<>(Set.of(LayoutEngine.AI)));
|
|
||||||
}
|
|
||||||
Image imageNode = imageBuilder.build();
|
|
||||||
page.getMainBody().add(imageNode);
|
page.getMainBody().add(imageNode);
|
||||||
|
return imageNode;
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
|
||||||
imageNode.setTreeId(treeId);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -164,11 +185,12 @@ public class DocumentGraphFactory {
|
|||||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||||
|
|
||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||||
footer,
|
footer,
|
||||||
context,
|
context,
|
||||||
page);
|
page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
footer.setTreeId(tocId);
|
footer.setTreeId(tocId);
|
||||||
footer.setLeafTextBlock(textBlock);
|
footer.setLeafTextBlock(textBlock);
|
||||||
@ -179,7 +201,8 @@ public class DocumentGraphFactory {
|
|||||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||||
|
|
||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
header.setTreeId(tocId);
|
header.setTreeId(tocId);
|
||||||
@ -191,7 +214,8 @@ public class DocumentGraphFactory {
|
|||||||
private void addEmptyFooter(int pageIndex, Context context) {
|
private void addEmptyFooter(int pageIndex, Context context) {
|
||||||
|
|
||||||
Page page = context.getPage(pageIndex);
|
Page page = context.getPage(pageIndex);
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
footer.setTreeId(tocId);
|
footer.setTreeId(tocId);
|
||||||
@ -203,7 +227,8 @@ public class DocumentGraphFactory {
|
|||||||
private void addEmptyHeader(int pageIndex, Context context) {
|
private void addEmptyHeader(int pageIndex, Context context) {
|
||||||
|
|
||||||
Page page = context.getPage(pageIndex);
|
Page page = context.getPage(pageIndex);
|
||||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
header.setTreeId(tocId);
|
header.setTreeId(tocId);
|
||||||
|
|||||||
@ -13,6 +13,7 @@ import java.util.Set;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
@ -30,27 +31,48 @@ public class SectionNodeFactory {
|
|||||||
GenericSemanticNode parentNode,
|
GenericSemanticNode parentNode,
|
||||||
List<AbstractPageBlock> pageBlocks,
|
List<AbstractPageBlock> pageBlocks,
|
||||||
List<ClassifiedImage> images,
|
List<ClassifiedImage> images,
|
||||||
DocumentGraphFactory.Context context) {
|
DocumentGraphFactory.Context context,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
|
// This is for the case where we have images on a page without any text/footer/header.
|
||||||
|
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||||
|
if (!images.isEmpty() && pageBlocks.isEmpty()) {
|
||||||
|
images.stream()
|
||||||
|
.distinct()
|
||||||
|
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (pageBlocks.isEmpty()) {
|
if (pageBlocks.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
|
||||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||||
|
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||||
|
Section section = Section.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
|
|
||||||
context.getSections().add(section);
|
context.getSections().add(section);
|
||||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
blocksPerPage.keySet()
|
||||||
|
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||||
|
|
||||||
section.setTreeId(getTreeId(parentNode, context, section));
|
section.setTreeId(getTreeId(parentNode, context, section));
|
||||||
|
|
||||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
|
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context));
|
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||||
|
section,
|
||||||
|
subSectionPageBlocks,
|
||||||
|
emptyList(),
|
||||||
|
context,
|
||||||
|
document));
|
||||||
} else {
|
} else {
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section);
|
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||||
}
|
}
|
||||||
|
|
||||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
images.stream()
|
||||||
|
.distinct()
|
||||||
|
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -64,10 +86,14 @@ public class SectionNodeFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
||||||
|
List<AbstractPageBlock> pageBlocks,
|
||||||
|
DocumentGraphFactory.Context context,
|
||||||
|
Section section,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
if (pageBlocks.get(0).isHeadline()) {
|
if (pageBlocks.get(0).isHeadline()) {
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section);
|
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section, document);
|
||||||
pageBlocks.remove(0);
|
pageBlocks.remove(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -76,7 +102,8 @@ public class SectionNodeFactory {
|
|||||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||||
List<AbstractPageBlock> pageBlocks,
|
List<AbstractPageBlock> pageBlocks,
|
||||||
DocumentGraphFactory.Context context,
|
DocumentGraphFactory.Context context,
|
||||||
Section section) {
|
Section section,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||||
@ -105,7 +132,7 @@ public class SectionNodeFactory {
|
|||||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||||
alreadyMerged.addAll(tablesToMerge);
|
alreadyMerged.addAll(tablesToMerge);
|
||||||
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context);
|
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||||
}
|
}
|
||||||
@ -115,7 +142,9 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
return pageBlocks.stream()
|
||||||
|
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
|
||||||
|
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -131,7 +160,9 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||||
movePrecedingHeadlineToTableList(splitList);
|
movePrecedingHeadlineToTableList(splitList);
|
||||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
return splitList.stream()
|
||||||
|
.filter(list -> !list.isEmpty())
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -152,7 +183,8 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||||
|
|
||||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
return abstractPageBlocks.stream()
|
||||||
|
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
@ -28,23 +29,30 @@ public class TableNodeFactory {
|
|||||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||||
|
|
||||||
|
|
||||||
public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
public void addTable(LayoutParsingType layoutParsingType,
|
||||||
|
GenericSemanticNode parentNode,
|
||||||
|
List<TablePageBlock> tablesToMerge,
|
||||||
|
DocumentGraphFactory.Context context,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
setPageNumberInCells(tablesToMerge);
|
setPageNumberInCells(tablesToMerge);
|
||||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
Set<Page> pages = tablesToMerge.stream()
|
||||||
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
.map(AbstractPageBlock::getPage)
|
||||||
|
.map(context::getPage)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
List<List<Cell>> mergedRows = tablesToMerge.stream()
|
||||||
|
.map(TablePageBlock::getRows)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
|
|
||||||
Table table = Table.builder()
|
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
||||||
.documentTree(context.getDocumentTree())
|
|
||||||
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
|
||||||
.numberOfRows(mergedRows.size())
|
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||||
|
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||||
table.setTreeId(treeId);
|
table.setTreeId(treeId);
|
||||||
addTableCells(layoutParsingType, mergedRows, table, context);
|
addTableCells(layoutParsingType, mergedRows, table, context, document);
|
||||||
|
|
||||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||||
}
|
}
|
||||||
@ -64,7 +72,8 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||||
|
|
||||||
cell.getTextBlocks().stream()//
|
cell.getTextBlocks()
|
||||||
|
.stream()//
|
||||||
.filter(tb -> tb.getPage() == 0)//
|
.filter(tb -> tb.getPage() == 0)//
|
||||||
.forEach(tb -> tb.setPage(table.getPage()));
|
.forEach(tb -> tb.setPage(table.getPage()));
|
||||||
}
|
}
|
||||||
@ -83,28 +92,44 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||||
|
|
||||||
if (table.streamHeaders().findAny().isEmpty()) {
|
if (table.streamHeaders()
|
||||||
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
.findAny().isEmpty()) {
|
||||||
|
table.streamRow(0)
|
||||||
|
.forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
|
||||||
|
|
||||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||||
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
addTableCell(layoutParsingType,
|
||||||
|
rows.get(rowIndex)
|
||||||
|
.get(colIndex),
|
||||||
|
rowIndex,
|
||||||
|
colIndex,
|
||||||
|
table,
|
||||||
|
context,
|
||||||
|
document);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||||
private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
private void addTableCell(LayoutParsingType layoutParsingType,
|
||||||
|
Cell cell,
|
||||||
|
int rowIndex,
|
||||||
|
int colIndex,
|
||||||
|
Table tableNode,
|
||||||
|
DocumentGraphFactory.Context context,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
Page page = context.getPage(cell.getPageNumber());
|
Page page = context.getPage(cell.getPageNumber());
|
||||||
|
|
||||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
||||||
|
.build();
|
||||||
page.getMainBody().add(tableCell);
|
page.getMainBody().add(tableCell);
|
||||||
|
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||||
@ -114,16 +139,27 @@ public class TableNodeFactory {
|
|||||||
if (cell.getTextBlocks().isEmpty()) {
|
if (cell.getTextBlocks().isEmpty()) {
|
||||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||||
} else if (cell.getTextBlocks().size() == 1) {
|
} else if (cell.getTextBlocks().size() == 1) {
|
||||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
textBlock = context.getTextBlockFactory()
|
||||||
|
.buildAtomicTextBlock(cell.getTextBlocks()
|
||||||
|
.get(0).getSequences(), tableCell, context, page);
|
||||||
tableCell.setLeafTextBlock(textBlock);
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
} else if (firstTextBlockIsHeadline(cell)) {
|
} else if (firstTextBlockIsHeadline(cell)) {
|
||||||
SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
SectionNodeFactory.addSection(layoutParsingType,
|
||||||
|
tableCell,
|
||||||
|
cell.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.map(tb -> (AbstractPageBlock) tb)
|
||||||
|
.toList(),
|
||||||
|
emptyList(),
|
||||||
|
context,
|
||||||
|
document);
|
||||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||||
tableCell.setLeafTextBlock(textBlock);
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
} else {
|
} else {
|
||||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
cell.getTextBlocks()
|
||||||
|
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,7 +172,8 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||||
|
|
||||||
return cell.getTextBlocks().get(0).isHeadline();
|
return cell.getTextBlocks()
|
||||||
|
.get(0).isHeadline();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user