diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index 9efc286..be36feb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -133,7 +133,7 @@ public abstract class BoundingBox { } - private boolean intersectsX(BoundingBox other, float threshold) { + public boolean intersectsX(BoundingBox other, float threshold) { return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java index b82bee7..0a11ffe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java @@ -1,12 +1,15 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; +import java.util.Comparator; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; +import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; @@ -29,9 +32,8 @@ public class Page { Integer height; Integer width; Integer rotation; - @EqualsAndHashCode.Exclude - List mainBody; + List textBlocksOnPage; @EqualsAndHashCode.Exclude Header header; @EqualsAndHashCode.Exclude @@ -53,20 +55,43 @@ public class Page { .width((int) classificationPage.getPageWidth()) .number(classificationPage.getPageNumber()) .rotation(classificationPage.getRotation()) - .mainBody(new LinkedList<>()) + .textBlocksOnPage(new LinkedList<>()) .build(); } + /** + * Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body. + * + * @return The main body text block. + */ public TextBlock getMainBodyTextBlock() { - return mainBody.stream() - .filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) + return textBlocksOnPage.stream() .collect(new TextBlockCollector()); } + public List getMainBody() { + + return textBlocksOnPage.stream() + .map(AtomicTextBlock::getParent) + .map(this::getHighestParentOnPage) + .distinct() + .toList(); + } + + + private SemanticNode getHighestParentOnPage(SemanticNode node) { + + SemanticNode currentNode = node; + while (currentNode.getParent().onlyOnPage(this)) { + currentNode = currentNode.getParent(); + } + return currentNode; + } + + @Override public String toString() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index 9223cd8..98103bf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -74,7 +74,8 @@ public interface SemanticNode { return getTextBlock().getPages() .stream() - .min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); + .min(Comparator.comparingInt(Page::getNumber)) + .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); } @@ -504,4 +505,17 @@ public interface SemanticNode { void accept(NodeVisitor visitor); + + /** + * Checks wether this SemanticNode appears on a single page only, and if that page is the provided one. + * + * @param page the page to check + * @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false. + */ + default boolean onlyOnPage(Page page) { + + Set pages = getPages(); + return pages.size() == 1 && pages.contains(page); + } + } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 113d55a..62958e2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -6,6 +6,7 @@ import static java.util.stream.Collectors.toList; import java.awt.geom.Rectangle2D; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -15,6 +16,7 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; @@ -32,7 +34,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -68,10 +72,25 @@ public class DocumentGraphFactory { documentGraph.setPages(context.pages.keySet()); documentGraph.setDocumentTree(context.documentTree); documentGraph.setTextBlock(documentGraph.getTextBlock()); + addTextBlocksToPages(documentGraph); + return documentGraph; } + private void addTextBlocksToPages(Document documentGraph) { + + documentGraph.streamAllSubNodes() + .filter(SemanticNode::isLeaf) + .filter(node -> !node.getType().equals(NodeType.HEADER)) + .filter(node -> !node.getType().equals(NodeType.FOOTER)) + .map(SemanticNode::getTextBlock) + .map(TextBlock::getAtomicTextBlocks) + .flatMap(Collection::stream) + .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb)); + } + + private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) { for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { @@ -105,8 +124,6 @@ public class DocumentGraphFactory { node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } - page.getMainBody().add(node); - List textBlocks = new ArrayList<>(); textBlocks.add(originalTextBlock); textBlocks.addAll(textBlocksToMerge); @@ -115,9 +132,9 @@ public class DocumentGraphFactory { if (node instanceof DuplicatedParagraph duplicatedParagraph) { AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream() - .flatMap(tb -> tb.getSequences() - .stream()) - .collect(Collectors.toList()), node, context, page); + .flatMap(tb -> tb.getSequences() + .stream()) + .collect(Collectors.toList()), node, context, page); duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); } @@ -141,7 +158,7 @@ public class DocumentGraphFactory { Rectangle2D position = image.getPosition(); Page page = context.getPage(image.getPage()); - Image imageNode = Image.builder() + return Image.builder() .id(IdBuilder.buildId(Set.of(page), List.of(position))) .imageType(image.getImageType()) .position(position) @@ -150,8 +167,6 @@ public class DocumentGraphFactory { .representationHash(image.getRepresentation()) .documentTree(context.getDocumentTree()) .build(); - page.getMainBody().add(imageNode); - return imageNode; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 507f8af..cf50cf1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -51,9 +51,6 @@ public class SectionNodeFactory { return Optional.empty(); } - Map> blocksPerPage = pageBlocks.stream() - .collect(groupingBy(AbstractPageBlock::getPage)); - AbstractSemanticNode section; boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks); if (isLeaf && !containsTablesAndTextBlocks) { @@ -63,8 +60,6 @@ public class SectionNodeFactory { } context.getSections().add(section); - blocksPerPage.keySet() - .forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber)); section.setTreeId(getTreeId(parentNode, context, section)); @@ -242,10 +237,5 @@ public class SectionNodeFactory { } - private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) { - - Page page = context.getPage(pageNumber); - page.getMainBody().add(section); - } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index 1060a68..e14075e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -51,8 +51,6 @@ public class TableNodeFactory { .numberOfRows(mergedRows.size()) .build(); - pages.forEach(page -> addTableToPage(page, parentNode, table)); - List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); table.setTreeId(treeId); addTableCells(layoutParsingType, mergedRows, table, context, document); @@ -82,17 +80,6 @@ public class TableNodeFactory { } - @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong - private void addTableToPage(Page page, SemanticNode parentNode, Table table) { - - if (!page.getMainBody().contains(parentNode)) { - parentNode.getPages().add(page); - } - - page.getMainBody().add(table); - } - - private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) { if (table.streamHeaders() @@ -107,14 +94,7 @@ public class TableNodeFactory { for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { - addTableCell(layoutParsingType, - rows.get(rowIndex) - .get(colIndex), - rowIndex, - colIndex, - table, - context, - document); + addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context, document); } } } @@ -131,14 +111,7 @@ public class TableNodeFactory { Page page = context.getPage(cell.getPageNumber()); - TableCell tableCell = TableCell.builder() - .documentTree(context.getDocumentTree()) - .row(rowIndex) - .col(colIndex) - .header(cell.isHeaderCell()) - .bBox(cell.getBBoxPdf()) - .build(); - page.getMainBody().add(tableCell); + TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBBoxPdf()).build(); List treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell); tableCell.setTreeId(treeId); @@ -147,9 +120,7 @@ public class TableNodeFactory { if (cell.getTextBlocks().isEmpty()) { tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); } else if (cell.getTextBlocks().size() == 1) { - textBlock = context.getTextBlockFactory() - .buildAtomicTextBlock2(cell.getTextBlocks() - .get(0).getSequences(), tableCell, context, page); + textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else if (firstTextBlockIsHeadline(cell)) { SectionNodeFactory.addSection(layoutParsingType, @@ -181,8 +152,7 @@ public class TableNodeFactory { private boolean firstTextBlockIsHeadline(Cell cell) { - return cell.getTextBlocks() - .get(0).isHeadline(); + return cell.getTextBlocks().get(0).isHeadline(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java index 5357fce..01db14c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java @@ -40,27 +40,26 @@ public class TextBlockFactory { orientation = sequences.get(0).getDir().toString(); textRotation = sequences.get(0).getDir().getRotation(); } - return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(), - searchTextWithTextPositionDto.getLineBreaks(), - searchTextWithTextPositionDto.getBoldTextBoundaries(), - searchTextWithTextPositionDto.getItalicTextBoundaries(), - searchTextWithTextPositionDto.getPositions(), - searchTextWithTextPositionDto.getStringIdxToPositionIdx(), - idx, - parent, - numberOnPage, - page, - offset, - orientation, - textRotation); + var atb = AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(), + searchTextWithTextPositionDto.getLineBreaks(), + searchTextWithTextPositionDto.getBoldTextBoundaries(), + searchTextWithTextPositionDto.getItalicTextBoundaries(), + searchTextWithTextPositionDto.getPositions(), + searchTextWithTextPositionDto.getStringIdxToPositionIdx(), + idx, + parent, + numberOnPage, + page, + offset, + orientation, + textRotation); + return atb; } public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) { - long idx = textBlockIdx; - textBlockIdx++; - return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent); + return emptyTextBlock(parent, context.getAndIncrementTextBlockNumberOnPage(page), page); } @@ -68,7 +67,8 @@ public class TextBlockFactory { long idx = textBlockIdx; textBlockIdx++; - return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent); + var atb = AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent); + return atb; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java index 2b7e087..766de14 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java @@ -41,7 +41,9 @@ public class DocumentGraphMapper { DocumentTree documentTree = new DocumentTree(document); Context context = new Context(documentData, documentTree); - context.pages.addAll(Arrays.stream(documentData.getDocumentPages()).map(DocumentGraphMapper::buildPage).toList()); + context.pages.addAll(Arrays.stream(documentData.getDocumentPages()) + .map(DocumentGraphMapper::buildPage) + .toList()); context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context)); @@ -59,7 +61,9 @@ public class DocumentGraphMapper { List newEntries = new LinkedList<>(); for (DocumentStructure.EntryData entryData : entries) { - List pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList(); + List pages = Arrays.stream(entryData.getPageNumbers()) + .map(pageNumber -> getPage(pageNumber, context)) + .toList(); SemanticNode node = switch (entryData.getType()) { case SECTION -> buildSection(context); @@ -77,16 +81,17 @@ public class DocumentGraphMapper { if (entryData.getAtomicBlockIds().length > 0) { TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node); node.setLeafTextBlock(textBlock); + switch (entryData.getType()) { + case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); + case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); + default -> textBlock.getAtomicTextBlocks() + .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb)); + } } - List treeId = Arrays.stream(entryData.getTreeId()).boxed().toList(); + List treeId = Arrays.stream(entryData.getTreeId()).boxed() + .toList(); node.setTreeId(treeId); - switch (entryData.getType()) { - case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); - case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); - default -> pages.forEach(page -> page.getMainBody().add(node)); - } - newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build()); } return newEntries; @@ -142,6 +147,7 @@ public class DocumentGraphMapper { return Section.builder().documentTree(context.documentTree).build(); } + private SuperSection buildSuperSection(Context context) { return SuperSection.builder().documentTree(context.documentTree).build(); @@ -166,22 +172,24 @@ public class DocumentGraphMapper { private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { - return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector()); + return Arrays.stream(atomicTextBlockIds) + .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)) + .collect(new TextBlockCollector()); } private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)), - context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), - parent, - getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); + context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), + parent, + getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); } private Page buildPage(DocumentPage p) { - return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build(); + return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build(); } @@ -206,8 +214,10 @@ public class DocumentGraphMapper { this.documentTree = documentTree; this.pages = new LinkedList<>(); - this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList(); - this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList(); + this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()) + .toList(); + this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()) + .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java index fe15845..3f9a92b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java @@ -1,9 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; import java.util.LinkedList; import java.util.List; -import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; @@ -22,29 +23,83 @@ public class TableMergingUtility { List consecutiveTables = pageBlocks.stream() .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock)) + .sorted(Comparator.comparingInt(TablePageBlock::getPage).thenComparing(TablePageBlock::getY).thenComparing(TablePageBlock::getX)) .toList(); + assert consecutiveTables.size() == pageBlocks.size() - 1; + var currentTable = originalTablePageBlock; + int currentTableIndex = 0; List consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>(); - for (TablePageBlock consecutiveTable : consecutiveTables) { - if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock, - consecutiveTable)) { + consecutiveTablesWithSameColCountAndHeaders.add(originalTablePageBlock); + for (int i = 0; i < consecutiveTables.size(); i++) { + TablePageBlock consecutiveTable = consecutiveTables.get(i); + + if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() // + && headersMatch(originalTablePageBlock, consecutiveTable) // + && outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) // + && consecutiveOrSamePage(currentTable, consecutiveTable) // + && !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) { + + currentTable = consecutiveTable; + currentTableIndex = i; consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable); } } - return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList(); + return consecutiveTablesWithSameColCountAndHeaders; + } + + + private static List findTablesBetween(List consecutiveTables, int currentTableIndex, int i) { + + if (currentTableIndex + 1 == consecutiveTables.size() || currentTableIndex + 1 >= i) { + return Collections.emptyList(); + } + return consecutiveTables.subList(currentTableIndex + 1, i); + } + + + private static boolean consecutiveOrSamePage(TablePageBlock currentTable, TablePageBlock consecutiveTable) { + + return currentTable.getPage() == consecutiveTable.getPage() || currentTable.getPage() + 1 == consecutiveTable.getPage(); + } + + + private static boolean tableBetween(TablePageBlock currentTable, TablePageBlock consecutiveTable, List tablesBetween) { + + if (tablesBetween.isEmpty()) { + return false; + } + // assumes the tables are on the same page or on consecutive pages, all tables on pages in between are ignored. + return tablesBetween.stream() + .filter(tableBetween -> tableBetween.getPage() == currentTable.getPage()) + .anyMatch(tableBetween -> tableBetween.isBelow(currentTable)) // + || tablesBetween.stream() + .filter(tableBetween -> tableBetween.getPage() == consecutiveTable.getPage()) + .anyMatch(tableBetween -> tableBetween.isAbove(consecutiveTable)); + } + + + private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) { + + return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable)); } private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) { - return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD; + return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD + && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD; } - private boolean hasTableHeader(TablePageBlock table) { + private List getHeaders(TablePageBlock table) { - return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell); + return table.getRows() + .stream() + .flatMap(Collection::stream) + .filter(Cell::isHeaderCell) + .toList(); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index ce1a37a..6a4c74a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class LayoutparserEnd2EndTest extends AbstractTest { - public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE; + public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD; @Autowired private LayoutParsingPipeline layoutParsingPipeline; @@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Disabled public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf"; + String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327/560e6ab1ab4754b9a62fd2e6d4d71327.ORIGIN.pdf"; runForFile(filePath); }