RED-7074: Design Subsection section tree structure algorithm

* first draft: further implementations
2024-04-17 14:31:48 +02:00 · 2024-04-17 14:31:48 +02:00 · 17756f5977
commit 17756f5977
parent 59d9d6c3e6
11 changed files with 463 additions and 54 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@ -31,7 +31,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
-import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -44,7 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
 import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
-import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
 import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
 import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
 import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
@ -231,14 +231,15 @@ public class LayoutParsingPipeline {
        PDDocument originDocument = openDocument(originFile);
        addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));

-        OutlineObjectTree outlineObjectTree = outlineExtractorService.getOutlineObjectTree(originDocument);
-
        Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
        Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
        Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
        ClassificationDocument classificationDocument = new ClassificationDocument();
        List<ClassificationPage> classificationPages = new ArrayList<>();

+        // parsing the structure elements could be useful as well
+        classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
+
        long pageCount = originDocument.getNumberOfPages();

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
@ -296,10 +297,11 @@ public class LayoutParsingPipeline {
                case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
            };

-            List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
+            List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
                    .get(pageNumber - 1);
-            if(outlineObjects != null) {
-                blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage, outlineObjects);
+            if (outlineObjects != null) {
+                classificationPage.setOutlineObjects(outlineObjects);
+                blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage);
            }

            classificationPage.setCleanRulings(cleanRulings);
@ -361,7 +363,6 @@ public class LayoutParsingPipeline {
                .toList();
        // ???

-
        log.info("Building Sections for {}", identifier);

        switch (layoutParsingType) {
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
 import java.util.ArrayList;
 import java.util.List;

+import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;

@ -26,4 +27,6 @@ public class ClassificationDocument {

    private long rulesVersion;

+    private OutlineObjectTree outlineObjectTree;
+
 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
@ -8,13 +8,13 @@ import java.util.Map;

 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;

 import lombok.Data;
 import lombok.NonNull;
 import lombok.RequiredArgsConstructor;
-import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;

@Data
@RequiredArgsConstructor
@ -23,6 +23,10 @@ public class ClassificationPage {
    @NonNull
    private List<AbstractPageBlock> textBlocks;

+    private List<OutlineObject> outlineObjects = new ArrayList<>();
+
+    private List<AbstractPageBlock> headlines  = new ArrayList<>();
+
    private List<ClassifiedImage> images = new ArrayList<>();

    private Rectangle bodyTextFrame;
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java
@ -1,4 +1,4 @@
-package com.knecon.fforesight.service.layoutparser.processor.services;
+package com.knecon.fforesight.service.layoutparser.processor.model.outline;

 import java.awt.geom.Point2D;
 import java.io.IOException;
@ -74,6 +74,8 @@ public class OutlineExtractorService {
    }


+    // if the structure elements are processed beforehand, another case can be handled here as well:
+    // outline objects can reference structure elements (see pdf documentation)
    @SneakyThrows
    private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {

--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java
@ -39,11 +39,4 @@ public class OutlineObjectTree {
        }
    }

-
-    @Override
-    public String toString() {
-
-        return super.toString();
-    }
-
 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java
@ -0,0 +1,210 @@
+package com.knecon.fforesight.service.layoutparser.processor.model.outline;
+
+import java.awt.geom.Point2D;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import org.springframework.stereotype.Service;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Draft service for building a {@link TableOfContents} from headline blocks and for validating newly
+ * classified headlines against the headlines that were already known beforehand.
+ * <p>
+ * Several steps are still unimplemented placeholders; they are marked as such below.
+ */
+@Service
+@Slf4j
+public class OutlineValidationService {
+
+
+    /**
+     * Builds a table of contents from {@code headlinesFromOutlines} and walks over every item of the table
+     * of contents built from {@code allHeadlines}; items missing in the former are handed to
+     * {@link #addItemAtCorrectPosition}.
+     * <p>
+     * NOTE: {@link #addItemAtCorrectPosition} is still a placeholder, so the returned table of contents
+     * currently contains the outline headlines only.
+     *
+     * @param allHeadlines             all headline blocks in document order
+     * @param headlinesFromOutlines    headline blocks that were already known before classification
+     * @param newlyClassifiedHeadlines headline blocks added by classification (currently unused)
+     * @return the table of contents built from {@code headlinesFromOutlines}
+     */
+    public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
+
+        TableOfContents validatedToC = createToC(headlinesFromOutlines);
+        TableOfContents currentToC = createToC(allHeadlines);
+
+        // remembers the most recent item that is also part of the validated table of contents
+        TableOfContentItem lastHeadlineFromOutlines = null;
+        for (TableOfContentItem tocItem : currentToC.getAllTableOfContentItems()) {
+            if (containsItem(validatedToC, tocItem)) {
+                lastHeadlineFromOutlines = tocItem;
+            } else {
+                addItemAtCorrectPosition(validatedToC, tocItem, lastHeadlineFromOutlines);
+            }
+        }
+        return validatedToC;
+    }
+
+
+    /** Returns whether any item of the given table of contents wraps the given block. */
+    private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
+
+        return toc.getMainSections()
+                .stream()
+                .anyMatch(existingItem -> existingItem.getTextPageBlock().equals(block) || existingItem.contains(block));
+    }
+
+
+    /** Returns whether the given item is a main section or a descendant of one. */
+    private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
+
+        return toc.getMainSections()
+                .stream()
+                .anyMatch(existingItem -> existingItem.equals(tocItem) || existingItem.contains(tocItem));
+    }
+
+
+    /** Placeholder: inserting a missing item into the validated table of contents is not implemented yet. */
+    private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
+
+        if (!tocItem.getChildren().isEmpty()) {
+            // TODO: decide where the item (and its children) belongs relative to lastHeadlineFromOutlines
+        }
+    }
+
+
+    /**
+     * Arranges the given headlines, expected in document order, as a tree.
+     * <p>
+     * Each headline becomes a child of the closest preceding headline with a strictly smaller depth
+     * (H1 = 1 ... H6 = 6). Headlines without such a predecessor become main sections.
+     *
+     * @param headlines headline blocks with a non-null classification
+     * @return the resulting table of contents
+     */
+    public TableOfContents createToC(List<TextPageBlock> headlines) {
+
+        List<TableOfContentItem> mainSections = new ArrayList<>();
+        TableOfContentItem previous = null;
+        for (TextPageBlock current : headlines) {
+            TableOfContentItem item = new TableOfContentItem(current);
+            int currentDepth = getDepth(current.getClassification());
+
+            // climb from the previously added item until an ancestor with a smaller depth is found
+            TableOfContentItem parent = previous;
+            while (parent != null && getDepth(parent.getTextPageBlock().getClassification()) >= currentDepth) {
+                parent = parent.getParent();
+            }
+
+            if (parent == null) {
+                mainSections.add(item);
+            } else {
+                parent.addChild(item);
+            }
+            previous = item;
+        }
+        return new TableOfContents(mainSections);
+    }
+
+
+    /**
+     * Draft: converts the newly classified headlines into outline objects.
+     * <p>
+     * NOTE: the created objects are not yet written into {@code outlineObjectTree}.
+     */
+    public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
+
+        List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()
+                .map(textPageBlock -> new OutlineObject(textPageBlock.getText(),
+                                                        textPageBlock.getPage(),
+                                                        new Point2D.Double(textPageBlock.getMinX(), textPageBlock.getMinY()),
+                                                        getDepth(textPageBlock.getClassification())))
+                .toList();
+        // TODO: merge newOutlineObjects into outlineObjectTree
+
+    }
+
+
+    /** Maps a headline type to its nesting depth; everything that is not H1-H5 is treated as depth 6. */
+    private static int getDepth(PageBlockType pageBlockType) {
+
+        return switch (pageBlockType) {
+            case H1 -> 1;
+            case H2 -> 2;
+            case H3 -> 3;
+            case H4 -> 4;
+            case H5 -> 5;
+            default -> 6;
+        };
+    }
+
+
+    /**
+     * Draft: inspects every newly classified headline together with its neighbours.
+     * <p>
+     * Only runs when the headlines that are not newly classified outnumber the newly classified ones.
+     * The method has no effect yet; the actual re-classification is still to be implemented.
+     */
+    public void validate(List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
+
+        int previouslyKnownCount = allHeadlines.size() - newlyClassifiedHeadlines.size();
+        if (previouslyKnownCount <= newlyClassifiedHeadlines.size()) {
+            return;
+        }
+
+        List<Headline> headlines = allHeadlines.stream()
+                .map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock)))
+                .toList();
+        for (TextPageBlock newHeadline : newlyClassifiedHeadlines) {
+            // Headline equality only looks at the wrapped block, so a probe instance locates the entry;
+            // looking up the raw TextPageBlock in a List<Headline> would always yield -1
+            int newHeadlineIndex = headlines.indexOf(new Headline(newHeadline, true));
+            List<TextPageBlock> adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines);
+            // TODO: find the neighbouring outline headlines before and after the new headline
+            //  (findNeighboringOutline); if both exist, compare their order (compareHeadlineOrder) and,
+            //  when the comparison is not 0, derive the classification of the new headline from it
+            //  (setClassification)
+        }
+
+    }
+
+
+    /**
+     * Collects the given headline plus the directly preceding and following entries of
+     * {@code newlyClassifiedHeadlines} for as long as they are equal to the current edge of the run.
+     * NOTE(review): the equality test only extends the run over duplicates of the same block - confirm
+     * whether positional adjacency was intended instead.
+     */
+    private List<TextPageBlock> findAdjacentNewlyClassified(TextPageBlock headline, List<TextPageBlock> newlyClassifiedHeadlines) {
+
+        List<TextPageBlock> adjacentNewlyClassified = new ArrayList<>();
+        int index = newlyClassifiedHeadlines.indexOf(headline);
+        if (index == -1) {
+            return adjacentNewlyClassified;
+        }
+
+        adjacentNewlyClassified.add(headline);
+        // extend the run towards the start of the list
+        for (int i = index - 1; i >= 0; i--) {
+            TextPageBlock candidate = newlyClassifiedHeadlines.get(i);
+            if (!candidate.equals(adjacentNewlyClassified.get(0))) {
+                break;
+            }
+            adjacentNewlyClassified.add(0, candidate);
+        }
+        // extend the run towards the end of the list
+        for (int i = index + 1; i < newlyClassifiedHeadlines.size(); i++) {
+            TextPageBlock candidate = newlyClassifiedHeadlines.get(i);
+            if (!candidate.equals(adjacentNewlyClassified.get(adjacentNewlyClassified.size() - 1))) {
+                break;
+            }
+            adjacentNewlyClassified.add(candidate);
+        }
+        return adjacentNewlyClassified;
+    }
+
+
+    /**
+     * Returns the entry of {@code headlinesFromOutlines} that lies {@code direction} positions away from
+     * the given headline, or {@code null} if the headline is not in the list or the position is out of range.
+     */
+    private TextPageBlock findNeighboringOutline(TextPageBlock headline, List<TextPageBlock> headlinesFromOutlines, int direction) {
+
+        int index = headlinesFromOutlines.indexOf(headline);
+        int neighborIndex = index + direction;
+        if (index != -1 && neighborIndex >= 0 && neighborIndex < headlinesFromOutlines.size()) {
+            return headlinesFromOutlines.get(neighborIndex);
+        }
+        return null;
+    }
+
+
+    /** Placeholder: the comparison of two headlines is not implemented yet and always yields 0. */
+    private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) {
+
+        return 0; // TODO: implement the actual comparison logic
+    }
+
+
+    /** Placeholder: deriving a classification from the neighbouring outlines is not implemented yet. */
+    private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) {
+        // TODO: implement the classification logic
+    }
+
+
+    /**
+     * Headline block together with a flag telling whether it was newly classified. Equality and hash code
+     * deliberately ignore the flag and only consider the wrapped block.
+     */
+    record Headline(TextPageBlock textPageBlock, boolean newlyClassified) {
+
+        @Override
+        public boolean equals(Object obj) {
+
+            if (this == obj) {
+                return true;
+            }
+            return obj instanceof Headline other && Objects.equals(textPageBlock, other.textPageBlock);
+        }
+
+
+        @Override
+        public int hashCode() {
+
+            return Objects.hash(textPageBlock);
+        }
+
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java
@ -0,0 +1,93 @@
+package com.knecon.fforesight.service.layoutparser.processor.model.outline;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+/**
+ * Node of the table-of-contents tree: wraps one headline {@link TextPageBlock} and links it to its
+ * parent and its child items. Equality and hash code are based on the wrapped block only.
+ */
+@Data
+@EqualsAndHashCode(onlyExplicitlyIncluded = true)
+public class TableOfContentItem {
+
+    @EqualsAndHashCode.Include
+    private TextPageBlock textPageBlock;
+    private List<TableOfContentItem> children = new ArrayList<>();
+    private TableOfContentItem parent;
+
+
+    public TableOfContentItem(TextPageBlock textPageBlock) {
+
+        this.textPageBlock = textPageBlock;
+    }
+
+
+    /** Appends the given item to the children and makes this item its parent. */
+    public void addChild(TableOfContentItem tableOfContentItem) {
+
+        children.add(tableOfContentItem);
+        tableOfContentItem.setParent(this);
+    }
+
+
+    /**
+     * @return the item directly before this one in the parent's child list, or {@code null} if this item
+     * has no parent, is not listed by its parent, or is the first child
+     */
+    public TableOfContentItem getSiblingBefore() {
+
+        return siblingAtOffset(-1);
+    }
+
+
+    /**
+     * @return the item directly after this one in the parent's child list, or {@code null} if this item
+     * has no parent, is not listed by its parent, or is the last child
+     */
+    public TableOfContentItem getSiblingAfter() {
+
+        return siblingAtOffset(1);
+    }
+
+
+    /** Looks up the sibling {@code offset} positions away without relying on exceptions for the bounds check. */
+    private TableOfContentItem siblingAtOffset(int offset) {
+
+        // root items have no parent and therefore no siblings
+        if (parent == null) {
+            return null;
+        }
+        List<TableOfContentItem> siblings = parent.getChildren();
+        int ownIndex = siblings.indexOf(this);
+        if (ownIndex < 0) {
+            return null;
+        }
+        int siblingIndex = ownIndex + offset;
+        if (siblingIndex < 0 || siblingIndex >= siblings.size()) {
+            return null;
+        }
+        return siblings.get(siblingIndex);
+    }
+
+
+    /** Returns whether any descendant of this item (not the item itself) wraps the given block. */
+    public boolean contains(TextPageBlock block) {
+
+        for (TableOfContentItem child : children) {
+            if (child.getTextPageBlock().equals(block) || child.contains(block)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    /** Returns whether any descendant of this item (not the item itself) equals the given item. */
+    public boolean contains(TableOfContentItem tocItem) {
+
+        for (TableOfContentItem child : children) {
+            if (child.equals(tocItem) || child.contains(tocItem)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    // parent and children are left out on purpose: printing them would recurse through the tree
+    @Override
+    public String toString() {
+
+        return "TableOfContentItem{" + "textPageBlock=" + textPageBlock + '}';
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java
@ -0,0 +1,59 @@
+package com.knecon.fforesight.service.layoutparser.processor.model.outline;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+
+import lombok.Data;
+import lombok.RequiredArgsConstructor;
+
+/**
+ * Table of contents as a forest of {@link TableOfContentItem}s; the roots are the main sections.
+ */
+@Data
+@RequiredArgsConstructor
+public class TableOfContents {
+
+    private List<TableOfContentItem> mainSections = new ArrayList<>();
+
+
+    public TableOfContents(List<TableOfContentItem> mainSections) {
+
+        this.mainSections = mainSections;
+    }
+
+
+    /**
+     * @return the text blocks of all items in pre-order (each item before its children), as a new list
+     */
+    public List<TextPageBlock> getAllTextPageBlocks() {
+
+        List<TextPageBlock> blocks = new ArrayList<>();
+        for (TableOfContentItem item : getAllTableOfContentItems()) {
+            blocks.add(item.getTextPageBlock());
+        }
+        return blocks;
+    }
+
+
+    /**
+     * @return all items in pre-order (each item before its children), as a new list
+     */
+    public List<TableOfContentItem> getAllTableOfContentItems() {
+
+        List<TableOfContentItem> flattened = new ArrayList<>();
+        mainSections.forEach(root -> appendSubtree(root, flattened));
+        return flattened;
+    }
+
+
+    /** Adds the given item followed by all of its descendants to the sink. */
+    private void appendSubtree(TableOfContentItem subtreeRoot, List<TableOfContentItem> sink) {
+
+        sink.add(subtreeRoot);
+        subtreeRoot.getChildren().forEach(descendant -> appendSubtree(descendant, sink));
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java
@ -36,8 +36,9 @@ public class BlockificationPostprocessingService {
            .collect(RectangleTransformations.collectBBox());


-    public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
+    public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {

+        List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
        if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
            return;
        }
@ -244,6 +245,13 @@ public class BlockificationPostprocessingService {
    }


+    // currently only three cases are handled here:
+    // 1. equality
+    // 2. outline title contains block text
+    // 3. block text contains outline title
+    // another possible case is an intersection, meaning a title is split up between two different blocks
+    // this should not happen with how docstrum creates the blocks
+    // if it is indeed necessary, a splitting has to be done with a follow-up merge
    private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {

        OutlineObject outlineObject = context.getOutlineObject();
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java
@ -1,8 +1,13 @@
 package com.knecon.fforesight.service.layoutparser.processor.services.classification;

+import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Pattern;

+import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
+import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
+
 import org.springframework.stereotype.Service;

 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -22,15 +27,43 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
 public class RedactManagerClassificationService {

+    private final OutlineValidationService outlineValidationService;
+
+
    public void classifyDocument(ClassificationDocument document) {

        List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

        log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

+        // snapshot of the blocks that are already headlines before the page classification runs;
+        // presumably these were classified from the PDF outline - TODO confirm
+        List<TextPageBlock> headlinesFromOutlines = document.getPages()
+                .stream()
+                .flatMap(classificationPage -> classificationPage.getTextBlocks()
+                        .stream()
+                        .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline())
+                        .map(tb -> (TextPageBlock) tb))
+                .toList();
+
        for (ClassificationPage page : document.getPages()) {
            classifyPage(page, document, headlineFontSizes);
        }
+
+        // every headline after classification, i.e. the snapshot above plus the newly classified ones
+        List<TextPageBlock> allHeadlines = document.getPages()
+                .stream()
+                .flatMap(classificationPage -> classificationPage.getTextBlocks()
+                        .stream()
+                        .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null  && tb.getClassification().isHeadline())
+                        .map(tb -> (TextPageBlock) tb))
+                .toList();
+
+        List<TextPageBlock> newlyClassifiedHeadlines = new ArrayList<>(allHeadlines);
+        newlyClassifiedHeadlines.removeAll(headlinesFromOutlines);
+
+        TableOfContents toC = outlineValidationService.createToC(allHeadlines);
+        // use the service logger instead of stdout so the output follows the logging configuration
+        log.debug("Table of contents built from all headlines: {}", toC);
+
+        outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines);
+
    }


@ -48,7 +81,7 @@ public class RedactManagerClassificationService {

        var bodyTextFrame = page.getBodyTextFrame();

-        if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
+        if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
            return;
        }
        if (document.getFontSizeCounter().getMostPopular() == null) {
@ -62,33 +95,30 @@ public class RedactManagerClassificationService {
                .anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
            textBlock.setClassification(PageBlockType.PARAGRAPH);
            return;
-        }
-
-        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
-                textBlock,
-                page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
+        }        if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
+            || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
+                                                                                                   || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
                .getMostPopular())) {
            textBlock.setClassification(PageBlockType.HEADER);

-        } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
-                textBlock,
-                page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
+        } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
+                   || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
+                                                                                                           || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
                .getMostPopular())) {
            textBlock.setClassification(PageBlockType.FOOTER);
-        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
-                document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
-                .size() == 1)) {
+        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
+                                                 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
            if (!Pattern.matches("[0-9]+", textBlock.toString())) {
                textBlock.setClassification(PageBlockType.TITLE);
            }
-        } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
-                .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
-                .getCountPerValue()
-                .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
-                .get(0)
-                .getTextPositions()
-                .get(0)
-                .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
+        } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
+                   && PositionUtils.getApproxLineCount(textBlock) < 4.9
+                   && (textBlock.getMostPopularWordStyle().equals("bold")
+                       || !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
+                          && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
+                   && textBlock.getSequences()
+                              .get(0).getTextPositions()
+                              .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {

            for (int i = 1; i <= headlineFontSizes.size(); i++) {
                if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
@ -96,25 +126,31 @@ public class RedactManagerClassificationService {
                    document.setHeadlines(true);
                }
            }
-        } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
-                .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
-                .get(0)
-                .getTextPositions()
-                .get(0)
-                .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
+        } else if (!textBlock.getText().startsWith("Figure ")
+                   && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                   && textBlock.getMostPopularWordStyle().equals("bold")
+                   && !document.getFontStyleCounter().getMostPopular().equals("bold")
+                   && PositionUtils.getApproxLineCount(textBlock) < 2.9
+                   && textBlock.getSequences()
+                              .get(0).getTextPositions()
+                              .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
            textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
            document.setHeadlines(true);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
-                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
+        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                   && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
+                   && textBlock.getMostPopularWordStyle().equals("bold")
+                   && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
-                .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
-                .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
+        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                   && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
+                   && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
+                   && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
            textBlock.setClassification(PageBlockType.PARAGRAPH);
-        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
-                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
-                .getMostPopular()
-                .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
+        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
+                   && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
+                   && textBlock.getMostPopularWordStyle().equals("italic")
+                   && !document.getFontStyleCounter().getMostPopular().equals("italic")
+                   && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java
@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
 import org.junit.jupiter.api.Test;
 import org.springframework.beans.factory.annotation.Autowired;

-import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
+import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
 import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
 import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;