RED-9139: add new TableOfContents Node

* rename previous TableOfContent to SectionTree
* add protobuf compile script
2024-11-08 14:57:46 +01:00 · 2024-11-08 14:57:46 +01:00 · 621ebd7378
commit 621ebd7378
parent 4b86307936
4 changed files with 305 additions and 307 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@ -39,8 +39,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
-import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -105,8 +105,8 @@ public class LayoutParsingPipeline {
    VisualLayoutParsingAdapter visualLayoutParsingAdapter;
    GraphicExtractorService graphicExtractorService;
    OutlineExtractorService outlineExtractorService;
-    OutlineValidationService outlineValidationService;
    SectionTreeBuilderService sectionTreeBuilderService;
+    SectionTreeEnhancementService sectionTreeEnhancementService;
    LayoutparserSettings settings;
    ClassificationService classificationService;

@ -344,14 +344,14 @@ public class LayoutParsingPipeline {

        classificationService.classify(classificationDocument, layoutParsingType, identifier);

-        SectionTree sectionTree = outlineValidationService.createSectionTree(classificationDocument);
+        SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
        classificationDocument.setSectionTree(sectionTree);

        log.info("Building Sections for {}", identifier);

        switch (layoutParsingType) {
            case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
-            default -> sectionTreeBuilderService.assignSectionBlocksAndImages(classificationDocument);
+            default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
        }

        return classificationDocument;
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java
@ -1,84 +0,0 @@
-package com.knecon.fforesight.service.layoutparser.processor.model.outline;
-
-import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
-import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeSet;
-
-import org.springframework.stereotype.Service;
-
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
-import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
-
-import io.micrometer.observation.annotation.Observed;
-import lombok.extern.slf4j.Slf4j;
-
-@Service
-@Slf4j
-public class OutlineValidationService {
-
-    @Observed(name = "OutlineValidationService", contextualName = "create-toc")
-    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {
-
-        List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
-
-        List<SectionTreeEntry> mainSections = new ArrayList<>();
-        Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
-        SectionTreeEntry last = null;
-        TreeSet<Integer> depths = new TreeSet<>();
-
-        for (TextPageBlock current : headlines) {
-            int currentDepth = getHeadlineNumber(current.getClassification());
-            Integer parentDepth = depths.floor(currentDepth - 1);
-
-            var tocItem = new SectionTreeEntry(current);
-
-            if (parentDepth == null) {
-                mainSections.add(tocItem);
-                lastItemsPerDepth = new HashMap<>();
-                depths = new TreeSet<>();
-
-            } else {
-                assert last != null;
-                int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
-                if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
-                    // headline after toc should always start a main section
-                    parentDepth = 1;
-                } else if (lastDepth < parentDepth) {
-                    parentDepth = lastDepth;
-                } else if (lastDepth == currentDepth && last.getParent() != null) {
-                    parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
-                }
-
-                SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
-                parent.addChild(tocItem);
-            }
-
-            last = tocItem;
-            lastItemsPerDepth.put(currentDepth, tocItem);
-            depths.add(currentDepth);
-        }
-
-        return new
-
-                SectionTree(mainSections);
-
-    }
-
-
-    private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
-
-        return classificationDocument.getPages()
-                .stream()
-                .flatMap(classificationPage -> classificationPage.getTextBlocks()
-                        .stream()
-                        .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
-                        .map(tb -> (TextPageBlock) tb))
-                .toList();
-    }
-
-}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeBuilderService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeBuilderService.java
@ -1,252 +1,82 @@
 package com.knecon.fforesight.service.layoutparser.processor.model.outline;

+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
+import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
+
 import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;

 import org.springframework.stereotype.Service;

-import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
-import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
-import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
-import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
-import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

+import io.micrometer.observation.annotation.Observed;
 import lombok.extern.slf4j.Slf4j;

-@Slf4j
@Service
+@Slf4j
 public class SectionTreeBuilderService {

-    public void assignSectionBlocksAndImages(ClassificationDocument document) {
+    /**
+     * Builds a {@link SectionTree} from the headline blocks of the given document.
+     * <p>
+     * Headlines are collected by {@code extractHeadlines} and processed in that order. For each headline the
+     * depth is taken from {@code getHeadlineNumber(classification)}, and the greatest depth recorded so far that
+     * is strictly smaller than the current depth is looked up. If no such depth exists, the headline is added to
+     * the main sections and the per-depth bookkeeping is reset. Otherwise the headline is added as a child of
+     * the entry last recorded at the (possibly adjusted) parent depth.
+     *
+     * @param classificationDocument document whose pages supply the headline blocks
+     * @return a new section tree wrapping the collected main sections
+     */
+    // NOTE(review): the observation name still refers to "OutlineValidationService", the class this method was
+    // moved from; confirm whether dashboards depend on the old name before renaming it.
+    @Observed(name = "OutlineValidationService", contextualName = "create-toc")
+    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {

-        SectionTree toc = document.getSectionTree();
-        Iterator<SectionTreeEntry> iterator = toc.iterator();
-        SectionTreeEntry currentTOCItem = null;
-        if (iterator.hasNext()) {
-            currentTOCItem = iterator.next();
-        }
-        List<AbstractPageBlock> startBlocks = new ArrayList<>();
-        List<ClassifiedImage> startImages = new ArrayList<>();
-        SectionTreeEntry currentSection = null;
-        boolean foundFirstHeadline = false;
+        List<TextPageBlock> headlines = extractHeadlines(classificationDocument);

-        List<ClassificationHeader> headers = new ArrayList<>();
-        List<ClassificationFooter> footers = new ArrayList<>();
-        TablePageBlock previousTable = null;
-        List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
+        List<SectionTreeEntry> mainSections = new ArrayList<>();
+        Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
+        SectionTreeEntry last = null;
+        TreeSet<Integer> depths = new TreeSet<>();

-        for (ClassificationPage page : document.getPages()) {
-            List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
-            List<TextPageBlock> header = new ArrayList<>();
-            List<TextPageBlock> footer = new ArrayList<>();
-            for (AbstractPageBlock current : page.getTextBlocks()) {
+        for (TextPageBlock current : headlines) {
+            int currentDepth = getHeadlineNumber(current.getClassification());
+            Integer parentDepth = depths.floor(currentDepth - 1);

-                if (current.getClassification() == null) {
-                    continue;
+            var tocItem = new SectionTreeEntry(current);
+
+            // No smaller depth recorded since the last reset: start a new main section and reset the bookkeeping.
+            if (parentDepth == null) {
+                mainSections.add(tocItem);
+                lastItemsPerDepth = new HashMap<>();
+                depths = new TreeSet<>();
+
+            } else {
+                assert last != null;
+                int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
+                if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
+                    // headline after toc should always start a main section
+                    parentDepth = 1;
+                } else if (lastDepth < parentDepth) {
+                    parentDepth = lastDepth;
+                } else if (lastDepth == currentDepth && last.getParent() != null) {
+                    parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
                 }

-                current.setPage(page.getPageNumber());
-
-                if (current.getClassification().equals(PageBlockType.HEADER)) {
-                    header.add((TextPageBlock) current);
-                    continue;
-                }
-
-                if (current.getClassification().equals(PageBlockType.FOOTER)) {
-                    footer.add((TextPageBlock) current);
-                    continue;
-                }
-
-                if (current instanceof TablePageBlock table) {
-                    if (previousTable != null) {
-                        mergeTableMetadata(table, previousTable);
-                    }
-                    previousTable = table;
-                }
-
-                if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
-                    if (!foundFirstHeadline) {
-                        foundFirstHeadline = true;
-                    }
-                    currentSection = currentTOCItem;
-                    currentTOCItem.getSectionBlocks().add(current);
-                    currentPageTOCItems.add(currentTOCItem);
-
-                    if (iterator.hasNext()) {
-                        currentTOCItem = iterator.next();
-                    }
-                } else if (!foundFirstHeadline) {
-                    startBlocks.add(current);
-                } else {
-                    currentSection.getSectionBlocks().add(current);
-                }
+                SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
+                // NOTE(review): parent is null if no entry is recorded for parentDepth (e.g. the forced depth 1
+                // above); addChild would then throw a NullPointerException - TODO confirm this cannot happen.
+                parent.addChild(tocItem);
             }

-            if (!currentPageTOCItems.isEmpty()) {
-                lastFoundTOCItems = currentPageTOCItems;
-            }
-
-            for (ClassifiedImage image : page.getImages()) {
-
-                Double xMin = null;
-                Double yMin = null;
-                Double xMax = null;
-                Double yMax = null;
-
-                for (SectionTreeEntry tocItem : lastFoundTOCItems) {
-                    var headline = tocItem.getHeadline();
-
-                    if (headline.getPage() != page.getPageNumber()) {
-                        continue;
-                    }
-
-                    if (headline.getMinX() < headline.getMaxX()) {
-                        if (xMin == null || headline.getMinX() < xMin) {
-                            xMin = headline.getMinX();
-                        }
-                        if (xMax == null || headline.getMaxX() > xMax) {
-                            xMax = headline.getMaxX();
-                        }
-                    } else {
-                        if (xMin == null || headline.getMaxX() < xMin) {
-                            xMin = headline.getMaxX();
-                        }
-                        if (xMax == null || headline.getMinX() > xMax) {
-                            xMax = headline.getMinX();
-                        }
-                    }
-
-                    if (headline.getMinY() < headline.getMaxY()) {
-                        if (yMin == null || headline.getMinY() < yMin) {
-                            yMin = headline.getMinY();
-                        }
-                        if (yMax == null || headline.getMaxY() > yMax) {
-                            yMax = headline.getMaxY();
-                        }
-                    } else {
-                        if (yMin == null || headline.getMaxY() < yMin) {
-                            yMin = headline.getMaxY();
-                        }
-                        if (yMax == null || headline.getMinY() > yMax) {
-                            yMax = headline.getMinY();
-                        }
-                    }
-
-                    log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
-                    log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
-
-                    if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
-                        tocItem.getImages().add(image);
-                        image.setAppendedToSection(true);
-                        break;
-                    }
-                }
-                if (!image.isAppendedToSection()) {
-                    log.debug("Image uses last found section");
-                    if (!lastFoundTOCItems.isEmpty()) {
-                        lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
-                    } else {
-                        startImages.add(image);
-                    }
-                    image.setAppendedToSection(true);
-                }
-            }
-
-            if (!header.isEmpty()) {
-                headers.add(new ClassificationHeader(header));
-            }
-            if (!footer.isEmpty()) {
-                footers.add(new ClassificationFooter(footer));
-            }
+            last = tocItem;
+            lastItemsPerDepth.put(currentDepth, tocItem);
+            depths.add(currentDepth);
         }

-        if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
-            SectionTreeEntry unassigned = new SectionTreeEntry(null);
-            unassigned.setSectionBlocks(startBlocks);
-            unassigned.setImages(startImages);
-            document.getSectionTree().getMainSections().add(0, unassigned);
-        }
-        document.setHeaders(headers);
-        document.setFooters(footers);
+        return new SectionTree(mainSections);
+
     }


-    private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
+    /**
+     * Collects the headline blocks of a document.
+     * <p>
+     * Walks the pages in the order returned by {@code getPages()} and, per page, the blocks in the order
+     * returned by {@code getTextBlocks()}. A block is kept only if it is a {@link TextPageBlock}, has a
+     * non-null classification, and that classification reports {@code isHeadline()}.
+     *
+     * @param classificationDocument document whose pages are scanned
+     * @return the matching blocks as an unmodifiable list (result of {@code Stream.toList()})
+     */
+    private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {

-        // Distribute header information for subsequent tables
-        if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
-            List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
-            List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
-            // Allow merging of tables if header row is separated from first logical non-header row
-            if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
-                previousTableNonHeaderRow = previousTable.getRows().get(0)
-                        .stream()
-                        .map(cell -> {
-                            Cell fakeCell = Cell.copy(cell);
-                            fakeCell.setHeaderCells(Collections.singletonList(cell));
-                            return fakeCell;
-                        })
-                        .toList();
-            }
-            if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
-                for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
-                    List<Cell> row = currentTable.getRows().get(i);
-                    if (row.size() == tableNonHeaderRow.size() && row.stream()
-                            .allMatch(cell -> cell.getHeaderCells().isEmpty())) {
-                        for (int j = 0; j < row.size(); j++) {
-                            row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-
-    private boolean hasValidHeaderInformation(TablePageBlock table) {
-
-        return !hasInvalidHeaderInformation(table);
-    }
-
-
-    private boolean hasInvalidHeaderInformation(TablePageBlock table) {
-
-        return table.getRows()
+        return classificationDocument.getPages()
                 .stream()
-                .flatMap(Collection::stream)
-                .allMatch(cell -> cell.getHeaderCells().isEmpty());
-    }
-
-
-    private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
-
-        for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
-            List<Cell> row = table.getRows().get(i);
-            if (row.size() == 1) {
-                continue;
-            }
-            boolean allNonHeader = true;
-            for (Cell cell : row) {
-                if (cell.isHeaderCell()) {
-                    allNonHeader = false;
-                    break;
-                }
-            }
-            if (allNonHeader) {
-                return row;
-            }
-        }
-
-        return Collections.emptyList();
-
+                .flatMap(classificationPage -> classificationPage.getTextBlocks()
+                        .stream()
+                        .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
+                        .map(tb -> (TextPageBlock) tb))
+                .toList();
     }

 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeEnhancementService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeEnhancementService.java
@ -0,0 +1,252 @@
+package com.knecon.fforesight.service.layoutparser.processor.model.outline;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
+import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
+import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
+import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+@Service
+public class SectionTreeEnhancementService {
+
+    /**
+     * Distributes the page blocks and images of a document onto the entries of its section tree and collects
+     * the per-page headers and footers.
+     * <p>
+     * Section-tree entries are consumed through {@code toc.iterator()}. A text block whose text equals the text
+     * of the next expected entry's headline starts that section; subsequent blocks are appended to it. Blocks
+     * seen before the first matched headline, and images seen before any headline was matched, are put into an
+     * extra entry with a {@code null} headline that is inserted at index 0 of the main sections.
+     * <p>
+     * Side effects on {@code document}: section blocks and images of the tree entries are filled, the page
+     * number is set on every classified block, and headers and footers are replaced.
+     *
+     * @param document document whose section tree has already been set
+     */
+    public void assignSectionBlocksAndImages(ClassificationDocument document) {
+
+        SectionTree toc = document.getSectionTree();
+        Iterator<SectionTreeEntry> iterator = toc.iterator();
+        // Next section-tree entry whose headline is expected to appear in the page blocks.
+        SectionTreeEntry currentTOCItem = null;
+        if (iterator.hasNext()) {
+            currentTOCItem = iterator.next();
+        }
+        // Blocks and images encountered before any headline was matched.
+        List<AbstractPageBlock> startBlocks = new ArrayList<>();
+        List<ClassifiedImage> startImages = new ArrayList<>();
+        SectionTreeEntry currentSection = null;
+        boolean foundFirstHeadline = false;
+
+        List<ClassificationHeader> headers = new ArrayList<>();
+        List<ClassificationFooter> footers = new ArrayList<>();
+        TablePageBlock previousTable = null;
+        // Entries matched on the most recent page that contained at least one matched headline.
+        List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
+
+        for (ClassificationPage page : document.getPages()) {
+            List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
+            List<TextPageBlock> header = new ArrayList<>();
+            List<TextPageBlock> footer = new ArrayList<>();
+            for (AbstractPageBlock current : page.getTextBlocks()) {
+
+                // Unclassified blocks are ignored entirely.
+                if (current.getClassification() == null) {
+                    continue;
+                }
+
+                current.setPage(page.getPageNumber());
+
+                // NOTE(review): the casts below assume only TextPageBlocks are classified HEADER/FOOTER - confirm.
+                if (current.getClassification().equals(PageBlockType.HEADER)) {
+                    header.add((TextPageBlock) current);
+                    continue;
+                }
+
+                if (current.getClassification().equals(PageBlockType.FOOTER)) {
+                    footer.add((TextPageBlock) current);
+                    continue;
+                }
+
+                // Header information is carried over from the previously seen table, across pages as well.
+                if (current instanceof TablePageBlock table) {
+                    if (previousTable != null) {
+                        mergeTableMetadata(table, previousTable);
+                    }
+                    previousTable = table;
+                }
+
+                // NOTE(review): when the iterator is exhausted, currentTOCItem keeps pointing at the last entry,
+                // so a later block with the same text would match it again - confirm this is intended.
+                if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
+                    if (!foundFirstHeadline) {
+                        foundFirstHeadline = true;
+                    }
+                    currentSection = currentTOCItem;
+                    currentTOCItem.getSectionBlocks().add(current);
+                    currentPageTOCItems.add(currentTOCItem);
+
+                    if (iterator.hasNext()) {
+                        currentTOCItem = iterator.next();
+                    }
+                } else if (!foundFirstHeadline) {
+                    startBlocks.add(current);
+                } else {
+                    currentSection.getSectionBlocks().add(current);
+                }
+            }
+
+            if (!currentPageTOCItems.isEmpty()) {
+                lastFoundTOCItems = currentPageTOCItems;
+            }
+
+            for (ClassifiedImage image : page.getImages()) {
+
+                // Bounding box accumulated over the headlines examined so far for this image; it is not reset
+                // between entries, so it grows to the union of those headlines' extents.
+                Double xMin = null;
+                Double yMin = null;
+                Double xMax = null;
+                Double yMax = null;
+
+                for (SectionTreeEntry tocItem : lastFoundTOCItems) {
+                    var headline = tocItem.getHeadline();
+
+                    // Only headlines located on the current page take part in the positional check.
+                    if (headline.getPage() != page.getPageNumber()) {
+                        continue;
+                    }
+
+                    // Min/max may be swapped on the headline, so both orientations are handled.
+                    if (headline.getMinX() < headline.getMaxX()) {
+                        if (xMin == null || headline.getMinX() < xMin) {
+                            xMin = headline.getMinX();
+                        }
+                        if (xMax == null || headline.getMaxX() > xMax) {
+                            xMax = headline.getMaxX();
+                        }
+                    } else {
+                        if (xMin == null || headline.getMaxX() < xMin) {
+                            xMin = headline.getMaxX();
+                        }
+                        if (xMax == null || headline.getMinX() > xMax) {
+                            xMax = headline.getMinX();
+                        }
+                    }
+
+                    if (headline.getMinY() < headline.getMaxY()) {
+                        if (yMin == null || headline.getMinY() < yMin) {
+                            yMin = headline.getMinY();
+                        }
+                        if (yMax == null || headline.getMaxY() > yMax) {
+                            yMax = headline.getMaxY();
+                        }
+                    } else {
+                        if (yMin == null || headline.getMaxY() < yMin) {
+                            yMin = headline.getMaxY();
+                        }
+                        if (yMax == null || headline.getMinY() > yMax) {
+                            yMax = headline.getMinY();
+                        }
+                    }
+
+                    log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
+                    log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
+
+                    // The image is attached to the first entry for which its position lies inside the box.
+                    if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
+                        tocItem.getImages().add(image);
+                        image.setAppendedToSection(true);
+                        break;
+                    }
+                }
+                // Fallback: last entry of the most recent page with headlines, or the start images if none exists.
+                if (!image.isAppendedToSection()) {
+                    log.debug("Image uses last found section");
+                    if (!lastFoundTOCItems.isEmpty()) {
+                        lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
+                    } else {
+                        startImages.add(image);
+                    }
+                    image.setAppendedToSection(true);
+                }
+            }
+
+            if (!header.isEmpty()) {
+                headers.add(new ClassificationHeader(header));
+            }
+            if (!footer.isEmpty()) {
+                footers.add(new ClassificationFooter(footer));
+            }
+        }
+
+        // Content found before the first headline goes into a headline-less entry at the front of the tree.
+        if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
+            SectionTreeEntry unassigned = new SectionTreeEntry(null);
+            unassigned.setSectionBlocks(startBlocks);
+            unassigned.setImages(startImages);
+            document.getSectionTree().getMainSections().add(0, unassigned);
+        }
+        document.setHeaders(headers);
+        document.setFooters(footers);
+    }
+
+
+    /**
+     * Distributes header information from {@code previousTable} onto {@code currentTable}.
+     * <p>
+     * Nothing happens unless the current table has no header cells assigned to any cell while the previous
+     * table has at least one. In that case every row of the current table that has the same width as its
+     * non-header reference row, and whose cells all lack header cells, receives the header cells of the
+     * previous table's non-header row, column by column.
+     *
+     * @param currentTable  table that may receive header cells
+     * @param previousTable table seen before {@code currentTable}; may be {@code null}
+     */
+    private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
+
+        if (previousTable == null || !hasInvalidHeaderInformation(currentTable) || !hasValidHeaderInformation(previousTable)) {
+            return;
+        }
+
+        List<Cell> donorRow = getRowWithNonHeaderCells(previousTable);
+        List<Cell> referenceRow = getRowWithNonHeaderCells(currentTable);
+        int columnCount = referenceRow.size();
+
+        // Allow merging of tables if header row is separated from first logical non-header row
+        boolean previousIsLoneRow = donorRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == columnCount;
+        if (previousIsLoneRow) {
+            List<Cell> syntheticRow = new ArrayList<>();
+            for (Cell headerCell : previousTable.getRows().get(0)) {
+                // Each copy points back at the original cell as its only header cell.
+                Cell fakeCell = Cell.copy(headerCell);
+                fakeCell.setHeaderCells(Collections.singletonList(headerCell));
+                syntheticRow.add(fakeCell);
+            }
+            donorRow = syntheticRow;
+        }
+
+        if (donorRow.size() != columnCount) {
+            return;
+        }
+
+        // Non header rows are most likely at bottom of table
+        for (int rowIndex = currentTable.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
+            List<Cell> row = currentTable.getRows().get(rowIndex);
+            if (row.size() != columnCount) {
+                continue;
+            }
+            boolean rowLacksHeaders = row.stream()
+                    .allMatch(cell -> cell.getHeaderCells().isEmpty());
+            if (!rowLacksHeaders) {
+                continue;
+            }
+            for (int column = 0; column < row.size(); column++) {
+                row.get(column).setHeaderCells(donorRow.get(column).getHeaderCells());
+            }
+        }
+    }
+
+
+    /**
+     * Negation of {@link #hasInvalidHeaderInformation(TablePageBlock)}.
+     *
+     * @param table table to inspect
+     * @return {@code true} if at least one cell of the table has header cells assigned
+     */
+    private boolean hasValidHeaderInformation(TablePageBlock table) {
+
+        boolean lacksHeaderInformation = hasInvalidHeaderInformation(table);
+        return !lacksHeaderInformation;
+    }
+
+
+    /**
+     * Checks whether no cell of the table has any header cells assigned.
+     *
+     * @param table table to inspect
+     * @return {@code true} if every cell's header-cell list is empty (also for a table without cells)
+     */
+    private boolean hasInvalidHeaderInformation(TablePageBlock table) {
+
+        for (List<Cell> row : table.getRows()) {
+            for (Cell cell : row) {
+                if (!cell.getHeaderCells().isEmpty()) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+
+    /**
+     * Finds the bottom-most row that does not consist of exactly one cell and contains no header cell.
+     *
+     * @param table table to search
+     * @return the matching row, or an empty list if no row qualifies
+     */
+    private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
+
+        // Non header rows are most likely at bottom of table
+        for (int rowIndex = table.getRowCount() - 1; rowIndex >= 0; rowIndex--) {
+            List<Cell> candidate = table.getRows().get(rowIndex);
+            boolean singleCellRow = candidate.size() == 1;
+            if (!singleCellRow && candidate.stream().noneMatch(Cell::isHeaderCell)) {
+                return candidate;
+            }
+        }
+
+        return Collections.emptyList();
+
+    }
+
+}