Compare commits

...

8 Commits

Author SHA1 Message Date
Kilian Schüttler
de266dcfe5 Merge branch 'RED-9964' into 'release/0.159.x'
Red 9964: don't merge tables on non-consecutive pages or with tables in between

See merge request fforesight/layout-parser!204
2024-08-30 14:00:50 +02:00
Kilian Schüttler
10e525f0de Red 9964: don't merge tables on non-consecutive pages or with tables in between 2024-08-30 14:00:50 +02:00
Dominique Eifländer
e0e5e35b30 Merge branch 'RED-9974-4.2' into 'release/0.159.x'
RED-9974: Improved headline detection for documine old

See merge request fforesight/layout-parser!203
2024-08-30 10:52:31 +02:00
Dominique Eifländer
e1d8d1ea3b RED-9974: Improved headline detection for documine old 2024-08-30 10:35:24 +02:00
Kilian Schüttler
1546c05dd8 Merge branch 'RED-9975-bp' into 'release/0.159.x'
activate outline detection

See merge request fforesight/layout-parser!200
2024-08-29 14:26:14 +02:00
Kilian Schuettler
7c88c30ca7 RED-9975: activate outline detection 2024-08-29 14:17:20 +02:00
Kilian Schüttler
50427d08dc Merge branch 'RED-9975-bp' into 'release/0.159.x'
RED-9975: activate outline detection

See merge request fforesight/layout-parser!199
2024-08-29 12:43:14 +02:00
Kilian Schuettler
338c6c5dd0 RED-9975: activate outline detection 2024-08-29 12:27:20 +02:00
13 changed files with 296 additions and 118 deletions

View File

@ -119,14 +119,18 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
.map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
@ -143,13 +147,20 @@ public class LayoutParsingPipeline {
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
layoutGridService.addLayoutGrid(viewerDocumentFile,
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentGraph));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
@ -246,7 +257,7 @@ public class LayoutParsingPipeline {
OutlineObject lastProcessedOutlineObject = null;
// parsing the structure elements could be useful as well
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
}
@ -324,7 +335,7 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
OutlineObject notFoundOutlineObject = null;
@ -379,6 +390,12 @@ public class LayoutParsingPipeline {
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
for (ClassificationPage page : classificationDocument.getPages()) {
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
}
}
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()

View File

@ -133,7 +133,7 @@ public abstract class BoundingBox {
}
private boolean intersectsX(BoundingBox other, float threshold) {
public boolean intersectsX(BoundingBox other, float threshold) {
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
}

View File

@ -1,12 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -29,9 +32,8 @@ public class Page {
Integer height;
Integer width;
Integer rotation;
@EqualsAndHashCode.Exclude
List<SemanticNode> mainBody;
List<AtomicTextBlock> textBlocksOnPage;
@EqualsAndHashCode.Exclude
Header header;
@EqualsAndHashCode.Exclude
@ -53,20 +55,43 @@ public class Page {
.width((int) classificationPage.getPageWidth())
.number(classificationPage.getPageNumber())
.rotation(classificationPage.getRotation())
.mainBody(new LinkedList<>())
.textBlocksOnPage(new LinkedList<>())
.build();
}
/**
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
*
* @return The main body text block.
*/
public TextBlock getMainBodyTextBlock() {
return mainBody.stream()
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
return textBlocksOnPage.stream()
.collect(new TextBlockCollector());
}
public List<SemanticNode> getMainBody() {
return textBlocksOnPage.stream()
.map(AtomicTextBlock::getParent)
.map(this::getHighestParentOnPage)
.distinct()
.toList();
}
private SemanticNode getHighestParentOnPage(SemanticNode node) {
SemanticNode currentNode = node;
while (currentNode.getParent().onlyOnPage(this)) {
currentNode = currentNode.getParent();
}
return currentNode;
}
@Override
public String toString() {

View File

@ -74,7 +74,8 @@ public interface SemanticNode {
return getTextBlock().getPages()
.stream()
.min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
}
@ -504,4 +505,17 @@ public interface SemanticNode {
void accept(NodeVisitor visitor);
/**
* Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
*
* @param page the page to check
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
*/
default boolean onlyOnPage(Page page) {
Set<Page> pages = getPages();
return pages.size() == 1 && pages.contains(page);
}
}

View File

@ -2,19 +2,23 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@SuppressWarnings("all")
@Service
public class DocuMineBlockificationService {
@ -57,8 +61,11 @@ public class DocuMineBlockificationService {
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold")
//
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")
|| Math.abs(prev.getFontSize() - word.getFontSize()) >= 1
|| Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8);
Matcher matcher = pattern.matcher(chunkWords.stream()
.collect(Collectors.joining(" ")).toString());
@ -120,5 +127,77 @@ public class DocuMineBlockificationService {
return new ClassificationPage(textPageBlocks);
}
public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block == null) {
continue;
}
if (block instanceof TablePageBlock) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) {
AbstractPageBlock abstractPageBlock = blocks.get(i);
if (abstractPageBlock == null) {
continue;
}
if (abstractPageBlock == current) {
continue;
}
if (abstractPageBlock instanceof TablePageBlock) {
continue;
}
if (isHeadlineFromOutline(current) || isHeadlineFromOutline(abstractPageBlock)) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (usedRulings.lineBetween(current, blocks.get(i))) {
continue;
}
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
.equals(inner.getClassification()))) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());
current = buildTextBlock(current.getSequences(), 0);
current.setClassification(inner.getClassification());
current.setToDuplicate(toDuplicate);
blocks.set(i, null);
itty.set(current);
}
}
}
var blocksIterator = blocks.iterator();
while (blocksIterator.hasNext()) {
if (blocksIterator.next() == null) {
blocksIterator.remove();
}
}
}
private boolean isHeadlineFromOutline(AbstractPageBlock abstractPageBlock) {
return abstractPageBlock.getEngines().contains(LayoutEngine.OUTLINE) && abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline();
}
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
return new TextPageBlock(wordBlockList);
}
}

View File

@ -74,7 +74,7 @@ public class DocuMineClassificationService {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
@ -108,7 +108,10 @@ public class DocuMineClassificationService {
&& Character.isDigit(textBlock.toString().charAt(0))
&& atLeast3Matcher.reset().find()
&& !textBlock.toString().contains(":") //
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") //
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT))
&& atLeast3Matcher.reset().find()
&& !textBlock.toString().contains(":")
&& !textBlock.toString().startsWith("(")//
|| textBlock.toString().startsWith("APPENDIX") //
|| textBlock.toString().startsWith("FIGURE") //
|| textBlock.toString().startsWith("Continued TABLE") //
@ -143,9 +146,9 @@ public class DocuMineClassificationService {
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else {
textBlock.setClassification(PageBlockType.OTHER);
textBlock.setClassification(PageBlockType.PARAGRAPH);
}
}

View File

@ -6,6 +6,7 @@ import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
@ -15,6 +16,7 @@ import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
@ -32,7 +34,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -68,10 +72,25 @@ public class DocumentGraphFactory {
documentGraph.setPages(context.pages.keySet());
documentGraph.setDocumentTree(context.documentTree);
documentGraph.setTextBlock(documentGraph.getTextBlock());
addTextBlocksToPages(documentGraph);
return documentGraph;
}
private void addTextBlocksToPages(Document documentGraph) {
documentGraph.streamAllSubNodes()
.filter(SemanticNode::isLeaf)
.filter(node -> !node.getType().equals(NodeType.HEADER))
.filter(node -> !node.getType().equals(NodeType.FOOTER))
.map(SemanticNode::getTextBlock)
.map(TextBlock::getAtomicTextBlocks)
.flatMap(Collection::stream)
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
@ -105,8 +124,6 @@ public class DocumentGraphFactory {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
page.getMainBody().add(node);
List<TextPageBlock> textBlocks = new ArrayList<>();
textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge);
@ -115,9 +132,9 @@ public class DocumentGraphFactory {
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock2(textBlocks.stream()
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toList()), node, context, page);
.flatMap(tb -> tb.getSequences()
.stream())
.collect(Collectors.toList()), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
}
@ -141,7 +158,7 @@ public class DocumentGraphFactory {
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
Image imageNode = Image.builder()
return Image.builder()
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
.imageType(image.getImageType())
.position(position)
@ -150,8 +167,6 @@ public class DocumentGraphFactory {
.representationHash(image.getRepresentation())
.documentTree(context.getDocumentTree())
.build();
page.getMainBody().add(imageNode);
return imageNode;
}

View File

@ -51,9 +51,6 @@ public class SectionNodeFactory {
return Optional.empty();
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
AbstractSemanticNode section;
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
if (isLeaf && !containsTablesAndTextBlocks) {
@ -63,8 +60,6 @@ public class SectionNodeFactory {
}
context.getSections().add(section);
blocksPerPage.keySet()
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
section.setTreeId(getTreeId(parentNode, context, section));
@ -242,10 +237,5 @@ public class SectionNodeFactory {
}
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) {
Page page = context.getPage(pageNumber);
page.getMainBody().add(section);
}
}

View File

@ -51,8 +51,6 @@ public class TableNodeFactory {
.numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(layoutParsingType, mergedRows, table, context, document);
@ -82,17 +80,6 @@ public class TableNodeFactory {
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
if (!page.getMainBody().contains(parentNode)) {
parentNode.getPages().add(page);
}
page.getMainBody().add(table);
}
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders()
@ -107,14 +94,7 @@ public class TableNodeFactory {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(layoutParsingType,
rows.get(rowIndex)
.get(colIndex),
rowIndex,
colIndex,
table,
context,
document);
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context, document);
}
}
}
@ -131,14 +111,7 @@ public class TableNodeFactory {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder()
.documentTree(context.getDocumentTree())
.row(rowIndex)
.col(colIndex)
.header(cell.isHeaderCell())
.bBox(cell.getBBoxPdf())
.build();
page.getMainBody().add(tableCell);
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBBoxPdf()).build();
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
tableCell.setTreeId(treeId);
@ -147,9 +120,7 @@ public class TableNodeFactory {
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock2(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page);
textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
@ -181,8 +152,7 @@ public class TableNodeFactory {
private boolean firstTextBlockIsHeadline(Cell cell) {
return cell.getTextBlocks()
.get(0).isHeadline();
return cell.getTextBlocks().get(0).isHeadline();
}
}

View File

@ -40,27 +40,26 @@ public class TextBlockFactory {
orientation = sequences.get(0).getDir().toString();
textRotation = sequences.get(0).getDir().getRotation();
}
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
searchTextWithTextPositionDto.getLineBreaks(),
searchTextWithTextPositionDto.getBoldTextBoundaries(),
searchTextWithTextPositionDto.getItalicTextBoundaries(),
searchTextWithTextPositionDto.getPositions(),
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
idx,
parent,
numberOnPage,
page,
offset,
orientation,
textRotation);
var atb = AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
searchTextWithTextPositionDto.getLineBreaks(),
searchTextWithTextPositionDto.getBoldTextBoundaries(),
searchTextWithTextPositionDto.getItalicTextBoundaries(),
searchTextWithTextPositionDto.getPositions(),
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
idx,
parent,
numberOnPage,
page,
offset,
orientation,
textRotation);
return atb;
}
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
return emptyTextBlock(parent, context.getAndIncrementTextBlockNumberOnPage(page), page);
}
@ -68,7 +67,8 @@ public class TextBlockFactory {
long idx = textBlockIdx;
textBlockIdx++;
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
var atb = AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
return atb;
}
}

View File

@ -41,7 +41,9 @@ public class DocumentGraphMapper {
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
context.pages.addAll(Arrays.stream(documentData.getDocumentPages()).map(DocumentGraphMapper::buildPage).toList());
context.pages.addAll(Arrays.stream(documentData.getDocumentPages())
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
@ -59,7 +61,9 @@ public class DocumentGraphMapper {
List<DocumentTree.Entry> newEntries = new LinkedList<>();
for (DocumentStructure.EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
.map(pageNumber -> getPage(pageNumber, context))
.toList();
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
@ -77,16 +81,17 @@ public class DocumentGraphMapper {
if (entryData.getAtomicBlockIds().length > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
.toList();
node.setTreeId(treeId);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
}
return newEntries;
@ -142,6 +147,7 @@ public class DocumentGraphMapper {
return Section.builder().documentTree(context.documentTree).build();
}
private SuperSection buildSuperSection(Context context) {
return SuperSection.builder().documentTree(context.documentTree).build();
@ -166,22 +172,24 @@ public class DocumentGraphMapper {
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
}
@ -206,8 +214,10 @@ public class DocumentGraphMapper {
this.documentTree = documentTree;
this.pages = new LinkedList<>();
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData())
.toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions())
.toList();
}

View File

@ -1,9 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
@ -22,29 +23,83 @@ public class TableMergingUtility {
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock))
.sorted(Comparator.comparingInt(TablePageBlock::getPage).thenComparing(TablePageBlock::getY).thenComparing(TablePageBlock::getX))
.toList();
assert consecutiveTables.size() == pageBlocks.size() - 1;
var currentTable = originalTablePageBlock;
int currentTableIndex = 0;
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
for (TablePageBlock consecutiveTable : consecutiveTables) {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
consecutiveTable)) {
consecutiveTablesWithSameColCountAndHeaders.add(originalTablePageBlock);
for (int i = 0; i < consecutiveTables.size(); i++) {
TablePageBlock consecutiveTable = consecutiveTables.get(i);
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() //
&& headersMatch(originalTablePageBlock, consecutiveTable) //
&& outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) //
&& consecutiveOrSamePage(currentTable, consecutiveTable) //
&& !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) {
currentTable = consecutiveTable;
currentTableIndex = i;
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
}
}
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
return consecutiveTablesWithSameColCountAndHeaders;
}
private static List<TablePageBlock> findTablesBetween(List<TablePageBlock> consecutiveTables, int currentTableIndex, int i) {
if (currentTableIndex + 1 == consecutiveTables.size() || currentTableIndex + 1 >= i) {
return Collections.emptyList();
}
return consecutiveTables.subList(currentTableIndex + 1, i);
}
private static boolean consecutiveOrSamePage(TablePageBlock currentTable, TablePageBlock consecutiveTable) {
return currentTable.getPage() == consecutiveTable.getPage() || currentTable.getPage() + 1 == consecutiveTable.getPage();
}
private static boolean tableBetween(TablePageBlock currentTable, TablePageBlock consecutiveTable, List<TablePageBlock> tablesBetween) {
if (tablesBetween.isEmpty()) {
return false;
}
// assumes the tables are on the same page or on consecutive pages, all tables on pages in between are ignored.
return tablesBetween.stream()
.filter(tableBetween -> tableBetween.getPage() == currentTable.getPage())
.anyMatch(tableBetween -> tableBetween.isBelow(currentTable)) //
|| tablesBetween.stream()
.filter(tableBetween -> tableBetween.getPage() == consecutiveTable.getPage())
.anyMatch(tableBetween -> tableBetween.isAbove(consecutiveTable));
}
private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) {
return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable));
}
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD
&& Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
}
private boolean hasTableHeader(TablePageBlock table) {
private List<Cell> getHeaders(TablePageBlock table) {
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
return table.getRows()
.stream()
.flatMap(Collection::stream)
.filter(Cell::isHeaderCell)
.toList();
}
}

View File

@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class LayoutparserEnd2EndTest extends AbstractTest {
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Disabled
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327/560e6ab1ab4754b9a62fd2e6d4d71327.ORIGIN.pdf";
runForFile(filePath);
}