RED-9964: don't merge tables on non-consecutive pages
This commit is contained in:
parent
cf39d4dfcc
commit
c5178ea5c2
@ -133,7 +133,7 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
private boolean intersectsX(BoundingBox other, float threshold) {
|
||||
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
||||
}
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -29,9 +32,8 @@ public class Page {
|
||||
Integer height;
|
||||
Integer width;
|
||||
Integer rotation;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
List<SemanticNode> mainBody;
|
||||
List<AtomicTextBlock> textBlocksOnPage;
|
||||
@EqualsAndHashCode.Exclude
|
||||
Header header;
|
||||
@EqualsAndHashCode.Exclude
|
||||
@ -53,20 +55,43 @@ public class Page {
|
||||
.width((int) classificationPage.getPageWidth())
|
||||
.number(classificationPage.getPageNumber())
|
||||
.rotation(classificationPage.getRotation())
|
||||
.mainBody(new LinkedList<>())
|
||||
.textBlocksOnPage(new LinkedList<>())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
|
||||
*
|
||||
* @return The main body text block.
|
||||
*/
|
||||
public TextBlock getMainBodyTextBlock() {
|
||||
|
||||
return mainBody.stream()
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
return textBlocksOnPage.stream()
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<SemanticNode> getMainBody() {
|
||||
|
||||
return textBlocksOnPage.stream()
|
||||
.map(AtomicTextBlock::getParent)
|
||||
.map(this::getHighestParentOnPage)
|
||||
.distinct()
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private SemanticNode getHighestParentOnPage(SemanticNode node) {
|
||||
|
||||
SemanticNode currentNode = node;
|
||||
while (currentNode.getParent().onlyOnPage(this)) {
|
||||
currentNode = currentNode.getParent();
|
||||
}
|
||||
return currentNode;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
|
||||
@ -74,7 +74,8 @@ public interface SemanticNode {
|
||||
|
||||
return getTextBlock().getPages()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
}
|
||||
|
||||
|
||||
@ -504,4 +505,17 @@ public interface SemanticNode {
|
||||
|
||||
void accept(NodeVisitor visitor);
|
||||
|
||||
|
||||
/**
|
||||
* Checks wether this SemanticNode appears on a single page only, and if that page is the provided one.
|
||||
*
|
||||
* @param page the page to check
|
||||
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
|
||||
*/
|
||||
default boolean onlyOnPage(Page page) {
|
||||
|
||||
Set<Page> pages = getPages();
|
||||
return pages.size() == 1 && pages.contains(page);
|
||||
}
|
||||
|
||||
}
|
||||
@ -6,6 +6,7 @@ import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -15,6 +16,7 @@ import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
@ -32,7 +34,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -68,10 +72,25 @@ public class DocumentGraphFactory {
|
||||
documentGraph.setPages(context.pages.keySet());
|
||||
documentGraph.setDocumentTree(context.documentTree);
|
||||
documentGraph.setTextBlock(documentGraph.getTextBlock());
|
||||
addTextBlocksToPages(documentGraph);
|
||||
|
||||
return documentGraph;
|
||||
}
|
||||
|
||||
|
||||
private void addTextBlocksToPages(Document documentGraph) {
|
||||
|
||||
documentGraph.streamAllSubNodes()
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.filter(node -> !node.getType().equals(NodeType.HEADER))
|
||||
.filter(node -> !node.getType().equals(NodeType.FOOTER))
|
||||
.map(SemanticNode::getTextBlock)
|
||||
.map(TextBlock::getAtomicTextBlocks)
|
||||
.flatMap(Collection::stream)
|
||||
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||
}
|
||||
|
||||
|
||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
@ -105,8 +124,6 @@ public class DocumentGraphFactory {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
@ -115,9 +132,9 @@ public class DocumentGraphFactory {
|
||||
|
||||
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream()
|
||||
.flatMap(tb -> tb.getSequences()
|
||||
.stream())
|
||||
.collect(Collectors.toList()), node, context, page);
|
||||
.flatMap(tb -> tb.getSequences()
|
||||
.stream())
|
||||
.collect(Collectors.toList()), node, context, page);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
|
||||
}
|
||||
|
||||
@ -141,7 +158,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Rectangle2D position = image.getPosition();
|
||||
Page page = context.getPage(image.getPage());
|
||||
Image imageNode = Image.builder()
|
||||
return Image.builder()
|
||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||
.imageType(image.getImageType())
|
||||
.position(position)
|
||||
@ -150,8 +167,6 @@ public class DocumentGraphFactory {
|
||||
.representationHash(image.getRepresentation())
|
||||
.documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
page.getMainBody().add(imageNode);
|
||||
return imageNode;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -51,9 +51,6 @@ public class SectionNodeFactory {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
|
||||
AbstractSemanticNode section;
|
||||
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
|
||||
if (isLeaf && !containsTablesAndTextBlocks) {
|
||||
@ -63,8 +60,6 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet()
|
||||
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
@ -242,10 +237,5 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) {
|
||||
|
||||
Page page = context.getPage(pageNumber);
|
||||
page.getMainBody().add(section);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -51,8 +51,6 @@ public class TableNodeFactory {
|
||||
.numberOfRows(mergedRows.size())
|
||||
.build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(layoutParsingType, mergedRows, table, context, document);
|
||||
@ -82,17 +80,6 @@ public class TableNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
|
||||
|
||||
if (!page.getMainBody().contains(parentNode)) {
|
||||
parentNode.getPages().add(page);
|
||||
}
|
||||
|
||||
page.getMainBody().add(table);
|
||||
}
|
||||
|
||||
|
||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||
|
||||
if (table.streamHeaders()
|
||||
@ -107,14 +94,7 @@ public class TableNodeFactory {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(layoutParsingType,
|
||||
rows.get(rowIndex)
|
||||
.get(colIndex),
|
||||
rowIndex,
|
||||
colIndex,
|
||||
table,
|
||||
context,
|
||||
document);
|
||||
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context, document);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -131,14 +111,7 @@ public class TableNodeFactory {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.row(rowIndex)
|
||||
.col(colIndex)
|
||||
.header(cell.isHeaderCell())
|
||||
.bBox(cell.getBBoxPdf())
|
||||
.build();
|
||||
page.getMainBody().add(tableCell);
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBBoxPdf()).build();
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||
tableCell.setTreeId(treeId);
|
||||
@ -147,9 +120,7 @@ public class TableNodeFactory {
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory()
|
||||
.buildAtomicTextBlock2(cell.getTextBlocks()
|
||||
.get(0).getSequences(), tableCell, context, page);
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(layoutParsingType,
|
||||
@ -181,8 +152,7 @@ public class TableNodeFactory {
|
||||
|
||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||
|
||||
return cell.getTextBlocks()
|
||||
.get(0).isHeadline();
|
||||
return cell.getTextBlocks().get(0).isHeadline();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -40,27 +40,26 @@ public class TextBlockFactory {
|
||||
orientation = sequences.get(0).getDir().toString();
|
||||
textRotation = sequences.get(0).getDir().getRotation();
|
||||
}
|
||||
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
||||
searchTextWithTextPositionDto.getLineBreaks(),
|
||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getPositions(),
|
||||
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
||||
idx,
|
||||
parent,
|
||||
numberOnPage,
|
||||
page,
|
||||
offset,
|
||||
orientation,
|
||||
textRotation);
|
||||
var atb = AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
||||
searchTextWithTextPositionDto.getLineBreaks(),
|
||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getPositions(),
|
||||
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
||||
idx,
|
||||
parent,
|
||||
numberOnPage,
|
||||
page,
|
||||
offset,
|
||||
orientation,
|
||||
textRotation);
|
||||
return atb;
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
|
||||
return emptyTextBlock(parent, context.getAndIncrementTextBlockNumberOnPage(page), page);
|
||||
}
|
||||
|
||||
|
||||
@ -68,7 +67,8 @@ public class TextBlockFactory {
|
||||
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
||||
var atb = AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
||||
return atb;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -41,7 +41,9 @@ public class DocumentGraphMapper {
|
||||
DocumentTree documentTree = new DocumentTree(document);
|
||||
Context context = new Context(documentData, documentTree);
|
||||
|
||||
context.pages.addAll(Arrays.stream(documentData.getDocumentPages()).map(DocumentGraphMapper::buildPage).toList());
|
||||
context.pages.addAll(Arrays.stream(documentData.getDocumentPages())
|
||||
.map(DocumentGraphMapper::buildPage)
|
||||
.toList());
|
||||
|
||||
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
|
||||
|
||||
@ -59,7 +61,9 @@ public class DocumentGraphMapper {
|
||||
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
||||
for (DocumentStructure.EntryData entryData : entries) {
|
||||
|
||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
||||
.map(pageNumber -> getPage(pageNumber, context))
|
||||
.toList();
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
@ -77,16 +81,17 @@ public class DocumentGraphMapper {
|
||||
if (entryData.getAtomicBlockIds().length > 0) {
|
||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
switch (entryData.getType()) {
|
||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||
default -> textBlock.getAtomicTextBlocks()
|
||||
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||
}
|
||||
}
|
||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
|
||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
|
||||
.toList();
|
||||
node.setTreeId(treeId);
|
||||
|
||||
switch (entryData.getType()) {
|
||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||
default -> pages.forEach(page -> page.getMainBody().add(node));
|
||||
}
|
||||
|
||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
||||
}
|
||||
return newEntries;
|
||||
@ -142,6 +147,7 @@ public class DocumentGraphMapper {
|
||||
return Section.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private SuperSection buildSuperSection(Context context) {
|
||||
|
||||
return SuperSection.builder().documentTree(context.documentTree).build();
|
||||
@ -166,22 +172,24 @@ public class DocumentGraphMapper {
|
||||
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
|
||||
return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
|
||||
return Arrays.stream(atomicTextBlockIds)
|
||||
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||
|
||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
}
|
||||
|
||||
|
||||
private Page buildPage(DocumentPage p) {
|
||||
|
||||
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
||||
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
|
||||
}
|
||||
|
||||
|
||||
@ -206,8 +214,10 @@ public class DocumentGraphMapper {
|
||||
|
||||
this.documentTree = documentTree;
|
||||
this.pages = new LinkedList<>();
|
||||
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
|
||||
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
|
||||
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData())
|
||||
.toList();
|
||||
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions())
|
||||
.toList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
@ -22,29 +23,83 @@ public class TableMergingUtility {
|
||||
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock))
|
||||
.sorted(Comparator.comparingInt(TablePageBlock::getPage).thenComparing(TablePageBlock::getY).thenComparing(TablePageBlock::getX))
|
||||
.toList();
|
||||
|
||||
assert consecutiveTables.size() == pageBlocks.size() - 1;
|
||||
var currentTable = originalTablePageBlock;
|
||||
int currentTableIndex = 0;
|
||||
|
||||
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
|
||||
for (TablePageBlock consecutiveTable : consecutiveTables) {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
||||
consecutiveTable)) {
|
||||
consecutiveTablesWithSameColCountAndHeaders.add(originalTablePageBlock);
|
||||
for (int i = 0; i < consecutiveTables.size(); i++) {
|
||||
TablePageBlock consecutiveTable = consecutiveTables.get(i);
|
||||
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() //
|
||||
&& headersMatch(originalTablePageBlock, consecutiveTable) //
|
||||
&& outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) //
|
||||
&& consecutiveOrSamePage(currentTable, consecutiveTable) //
|
||||
&& !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) {
|
||||
|
||||
currentTable = consecutiveTable;
|
||||
currentTableIndex = i;
|
||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||
}
|
||||
}
|
||||
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
|
||||
return consecutiveTablesWithSameColCountAndHeaders;
|
||||
}
|
||||
|
||||
|
||||
private static List<TablePageBlock> findTablesBetween(List<TablePageBlock> consecutiveTables, int currentTableIndex, int i) {
|
||||
|
||||
if (currentTableIndex + 1 == consecutiveTables.size() || currentTableIndex + 1 >= i) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return consecutiveTables.subList(currentTableIndex + 1, i);
|
||||
}
|
||||
|
||||
|
||||
private static boolean consecutiveOrSamePage(TablePageBlock currentTable, TablePageBlock consecutiveTable) {
|
||||
|
||||
return currentTable.getPage() == consecutiveTable.getPage() || currentTable.getPage() + 1 == consecutiveTable.getPage();
|
||||
}
|
||||
|
||||
|
||||
private static boolean tableBetween(TablePageBlock currentTable, TablePageBlock consecutiveTable, List<TablePageBlock> tablesBetween) {
|
||||
|
||||
if (tablesBetween.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
// assumes the tables are on the same page or on consecutive pages, all tables on pages in between are ignored.
|
||||
return tablesBetween.stream()
|
||||
.filter(tableBetween -> tableBetween.getPage() == currentTable.getPage())
|
||||
.anyMatch(tableBetween -> tableBetween.isBelow(currentTable)) //
|
||||
|| tablesBetween.stream()
|
||||
.filter(tableBetween -> tableBetween.getPage() == consecutiveTable.getPage())
|
||||
.anyMatch(tableBetween -> tableBetween.isAbove(consecutiveTable));
|
||||
}
|
||||
|
||||
|
||||
private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) {
|
||||
|
||||
return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable));
|
||||
}
|
||||
|
||||
|
||||
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
|
||||
|
||||
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
|
||||
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD
|
||||
&& Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasTableHeader(TablePageBlock table) {
|
||||
private List<Cell> getHeaders(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(Cell::isHeaderCell)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
|
||||
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
|
||||
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
|
||||
|
||||
@Autowired
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Disabled
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327/560e6ab1ab4754b9a62fd2e6d4d71327.ORIGIN.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user