CLARI-002: fix some stuff with DocumentDataParser

* still TODO: exclude semanticNodes inside TableCells
CLARI-002: markdown parser for documentData
2024-07-10 19:48:42 +02:00 · 2024-07-09 13:45:44 +02:00 · 2024-07-09 11:02:28 +02:00 · 2024-07-08 13:38:40 +02:00 · 2024-06-24 17:51:05 +02:00
35 changed files with 1769 additions and 310 deletions
--- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/IndexData.java
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/IndexData.java
@ -0,0 +1,11 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Data holder pairing a map of identifiers with a list of {@link TextChunk}s.
+ *
+ * NOTE(review): both fields are package-private and the class declares no constructor, accessors or Lombok
+ * annotations, so code outside this package can neither populate nor read it — confirm whether that is intended.
+ */
+public class IndexData {
+
+    // presumably the same general-purpose identifiers that LayoutParsingRequest carries — TODO confirm
+    Map<String, String> identifier;
+    List<TextChunk> textChunks;
+
+}
--- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/TextChunk.java
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/TextChunk.java
@ -0,0 +1,18 @@
+package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd;
+
+import java.util.List;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
+
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.experimental.FieldDefaults;
+
+/**
+ * Holder for the text of a single chunk. {@code @FieldDefaults} makes the field private and final; instances are
+ * created through the Lombok builder or the all-args constructor.
+ *
+ * NOTE(review): no getter is generated for {@code text}, so the value cannot be read from outside this class —
+ * confirm whether {@code @Getter} is missing.
+ */
+@Builder
+@AllArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class TextChunk {
+
+    String text;
+}
--- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java
@ -10,36 +10,25 @@ import lombok.NonNull;
@Builder
@Schema(description = "Object containing all storage paths the service needs to know.")
 public record LayoutParsingRequest(
-        @Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
        @NonNull LayoutParsingType layoutParsingType,

-        @Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
        Map<String, String> identifier,

-        @Schema(description = "Path to the original PDF file.")//
-        @NonNull String originFileStorageId,//
+        @NonNull String originFileStorageId,

+        Optional<String> tablesFileStorageId,
+        Optional<String> imagesFileStorageId,

-        @Schema(description = "Optional Path to the table extraction file.")//
-        Optional<String> tablesFileStorageId,//
-        @Schema(description = "Optional Path to the image classification file.")//
-        Optional<String> imagesFileStorageId,//
+        Optional<String> visualLayoutParsingFileId,

-        @Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
-
-        @Schema(description = "Path where the Document Structure File will be stored.")//
-        @NonNull String structureFileStorageId,//
-        @Schema(description = "Path where the Research Data File will be stored.")//
-        String researchDocumentStorageId,//
-        @Schema(description = "Path where the Document Text File will be stored.")//
-        @NonNull String textBlockFileStorageId,//
-        @Schema(description = "Path where the Document Positions File will be stored.")//
-        @NonNull String positionBlockFileStorageId,//
-        @Schema(description = "Path where the Document Pages File will be stored.")//
-        @NonNull String pageFileStorageId,//
-        @Schema(description = "Path where the Simplified Text File will be stored.")//
-        @NonNull String simplifiedTextStorageId,//
-        @Schema(description = "Path where the Viewer Document PDF will be stored.")//
-        @NonNull String viewerDocumentStorageId) {
+        @NonNull String structureFileStorageId,
+        String researchDocumentStorageId,
+        String markdownDocumentStorageId,
+        @NonNull String textBlockFileStorageId,
+        @NonNull String positionBlockFileStorageId,
+        @NonNull String pageFileStorageId,
+        @NonNull String simplifiedTextStorageId,
+        @NonNull String viewerDocumentStorageId
+) {

 }
--- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java
+++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java
@ -8,5 +8,6 @@ public enum LayoutParsingType {
    DOCUMINE,
    DOCUMINE_OLD,
    CLARIFYND,
-    CLARIFYND_PARAGRAPH_DEBUG
+    CLARIFYND_PARAGRAPH_DEBUG,
+    MARKDOWN
 }
--- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts
+++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts
@ -26,4 +26,10 @@ dependencies {
    implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
    implementation("org.jgrapht:jgrapht-core:1.5.2")
    implementation("org.tinspin:tinspin-indexes:2.1.3")
+    implementation("org.commonmark:commonmark:0.22.0")
+    implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
+    implementation("com.didalgo:gpt3-tokenizer:0.1.8")
+
+    implementation("org.mapstruct:mapstruct:1.5.5.Final")
+    annotationProcessor("org.mapstruct:mapstruct-processor:1.5.5.Final")
 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@ -4,6 +4,7 @@ import static java.lang.String.format;

 import java.awt.geom.Point2D;
 import java.awt.geom.Rectangle2D;
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
@ -18,12 +19,15 @@ import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+import org.commonmark.ext.gfm.tables.TablesExtension;
+import org.commonmark.renderer.markdown.MarkdownRenderer;
 import org.springframework.stereotype.Service;

 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
+import com.knecon.fforesight.service.layoutparser.processor.markdown.DocumentDataParser;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
@ -120,24 +124,18 @@ public class LayoutParsingPipeline {
        File viewerDocumentFile = originFile;

        VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
-        if (layoutParsingRequest.visualLayoutParsingFileId()
-                .isPresent()) {
-            visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
-                                                                                                         .get());
+        if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
+            visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
        }

        ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
-        if (layoutParsingRequest.imagesFileStorageId()
-                .isPresent()) {
-            imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
-                                                                                     .get());
+        if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
+            imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
        }

        TableServiceResponse tableServiceResponse = new TableServiceResponse();
-        if (layoutParsingRequest.tablesFileStorageId()
-                .isPresent()) {
-            tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
-                                                                                     .get());
+        if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
+            tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
        }

        ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
@ -163,12 +161,22 @@ public class LayoutParsingPipeline {
        layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
        layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);

-        if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) {
+        if (layoutParsingRequest.researchDocumentStorageId() != null) {
            log.info("Building research document data for {}", layoutParsingRequest.identifier());
            var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
            layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
        }

+        // Optional output: the markdown rendering is only produced when the request names a storage id for it.
+        if (layoutParsingRequest.markdownDocumentStorageId() != null) {
+            log.info("Rendering document data as markdown for {}", layoutParsingRequest.identifier());
+            var markdownDocument = DocumentDataParser.parse(documentGraph.streamAllSubNodes());
+            MarkdownRenderer renderer = MarkdownRenderer.builder().extensions(List.of(TablesExtension.create())).build();
+            String markdown = renderer.render(markdownDocument);
+            // Encode explicitly as UTF-8: the no-arg getBytes() uses the platform default charset, which would make
+            // the stored bytes depend on the JVM configuration. MarkdownParsingPipeline decodes markdown as UTF-8.
+            try (var in = new ByteArrayInputStream(markdown.getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
+                layoutParsingStorageService.storeObject(layoutParsingRequest.markdownDocumentStorageId(), in);
+            }
+        }
+
        if (!viewerDocumentFile.equals(originFile)) {
            viewerDocumentFile.delete();
        }
@ -254,7 +262,7 @@ public class LayoutParsingPipeline {
        OutlineObject lastProcessedOutlineObject = null;

        // parsing the structure elements could be useful as well
-        if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
+        if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
            classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
        }

@ -302,13 +310,9 @@ public class LayoutParsingPipeline {

            TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);

-            List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
-                                                                              pdPage,
-                                                                              pageNumber,
-                                                                              cleanRulings,
-                                                                              stripper.getTextPositionSequences(),
+            List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(),

-                                                                              false);
+                                                                                    false);

            pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
                    .addAll(graphics.stream()
@ -319,16 +323,11 @@ public class LayoutParsingPipeline {
                case REDACT_MANAGER_OLD ->
                        redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
                case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
-                case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
-                                                                                                                                                                    cleanRulings,
-                                                                                                                                                                    true,
-                                                                                                                                                                    classificationDocument.getVisualizations(),
-                                                                                                                                                                    layoutParsingType);
-                case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
-                                                                                                    cleanRulings,
-                                                                                                    false,
-                                                                                                    classificationDocument.getVisualizations(),
-                                                                                                    layoutParsingType);
+                case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
+                        docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
+                case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
+                        docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
+                default -> throw new IllegalArgumentException("Unexpected LayoutParsingType: " + layoutParsingType);
            };

            classificationPage.setCleanRulings(cleanRulings);
@ -338,7 +337,7 @@ public class LayoutParsingPipeline {
            classificationPage.setPageWidth(cropbox.getWidth());
            classificationPage.setPageHeight(cropbox.getHeight());

-            if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
+            if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
                List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());

                OutlineObject notFoundOutlineObject = null;
@ -387,8 +386,8 @@ public class LayoutParsingPipeline {
        }
        log.info("Classify TextBlocks for {}", identifier);
        switch (layoutParsingType) {
-            case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
-                    classificationDocument);
+            case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
+                    redactManagerClassificationService.classifyDocument(classificationDocument);
            case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
            case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
        }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java
@ -102,6 +102,11 @@ public class LayoutParsingStorageService {
        storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
    }

+    /**
+     * Stores the content of the given stream under {@code storageId}, using the tenant id from {@link TenantContext}.
+     *
+     * @param storageId id under which the object is stored
+     * @param in        stream supplying the object's content
+     */
+    public void storeObject(String storageId, InputStream in) {
+
+        storageService.storeObject(TenantContext.getTenantId(), storageId, in);
+    }
+

    private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {

--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/MarkdownParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/MarkdownParsingPipeline.java
@ -0,0 +1,74 @@
+package com.knecon.fforesight.service.layoutparser.processor;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.commonmark.Extension;
+import org.commonmark.ext.gfm.tables.TablesExtension;
+import org.commonmark.node.Document;
+import org.commonmark.node.Node;
+import org.commonmark.parser.Parser;
+import org.commonmark.renderer.Renderer;
+import org.commonmark.renderer.markdown.MarkdownRenderer;
+import org.springframework.stereotype.Service;
+
+import com.iqser.red.storage.commons.service.StorageService;
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
+import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownChunker;
+import com.knecon.fforesight.tenantcommons.TenantContext;
+
+import io.micrometer.observation.annotation.Observed;
+import lombok.AccessLevel;
+import lombok.RequiredArgsConstructor;
+import lombok.SneakyThrows;
+import lombok.experimental.FieldDefaults;
+
+/**
+ * Service that loads a markdown file from storage, parses it with commonmark (GFM tables extension enabled) and
+ * runs a {@link MarkdownChunker} over the parsed node tree.
+ */
+@Service
+@RequiredArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class MarkdownParsingPipeline {
+
+    StorageService storageService;
+
+
+    /**
+     * Reads the object stored under {@code request.originFileStorageId()} as UTF-8 text, parses it as markdown and
+     * lets the chunker visit the resulting node tree.
+     *
+     * NOTE(review): the loop over the produced chunks has an empty body and the renderer is never used, so nothing
+     * is written back to storage yet, despite the method name.
+     *
+     * @param request request supplying the origin file storage id and the identifier echoed in the result
+     * @return finished event carrying the request identifier, a fixed page count of 1 and the elapsed milliseconds
+     */
+    @SneakyThrows
+    @Observed(name = "MarkdownParsingPipeline", contextualName = "parse-markdown")
+    public LayoutParsingFinishedEvent parseMarkdownAndSaveToStorage(LayoutParsingRequest request) {
+
+        long start = System.currentTimeMillis();
+        String markdown;
+        try (var in = storageService.getObject(TenantContext.getTenantId(), request.originFileStorageId()).getInputStream()) {
+            markdown = new String(in.readAllBytes(), StandardCharsets.UTF_8);
+        }
+        Parser parser = buildParser();
+        Node node = parser.parse(markdown);
+
+        // presumably 600 is the maximum number of tokens per chunk — TODO confirm against MarkdownChunker
+        MarkdownChunker chunker = new MarkdownChunker(600);
+
+        node.accept(chunker);
+
+        Renderer renderer = buildRenderer();
+        List<Document> markdownChunks = chunker.getResult();
+        // NOTE(review): chunks are iterated but not rendered or stored yet
+        for (Document markdownChunk : markdownChunks) {
+
+        }
+
+        return LayoutParsingFinishedEvent.builder().identifier(request.identifier()).numberOfPages(1).duration(System.currentTimeMillis() - start).build();
+    }
+
+
+    /**
+     * Creates a commonmark parser with the GFM tables extension registered.
+     */
+    public static Parser buildParser() {
+
+        List<Extension> extensions = List.of(TablesExtension.create());
+        return Parser.builder().extensions(extensions).build();
+    }
+
+
+    /**
+     * Creates a commonmark markdown renderer with the GFM tables extension registered.
+     */
+    public static MarkdownRenderer buildRenderer() {
+
+        List<Extension> extensions = List.of(TablesExtension.create());
+        return MarkdownRenderer.builder().extensions(extensions).build();
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java
@ -0,0 +1,305 @@
+package com.knecon.fforesight.service.layoutparser.processor.markdown;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.stream.Stream;
+
+import org.commonmark.ext.gfm.tables.TableBlock;
+import org.commonmark.ext.gfm.tables.TableBody;
+import org.commonmark.ext.gfm.tables.TableCell;
+import org.commonmark.ext.gfm.tables.TableHead;
+import org.commonmark.ext.gfm.tables.TableRow;
+import org.commonmark.node.Document;
+import org.commonmark.node.Emphasis;
+import org.commonmark.node.HardLineBreak;
+import org.commonmark.node.Heading;
+import org.commonmark.node.Node;
+import org.commonmark.node.SoftLineBreak;
+import org.commonmark.node.StrongEmphasis;
+import org.commonmark.node.Text;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
+
+import lombok.experimental.UtilityClass;
+
+@UtilityClass
+public class DocumentDataParser {
+
+    /**
+     * Builds a commonmark {@link Document} from the given semantic nodes, appending the converted nodes in
+     * stream order. Semantic nodes that convert to {@code null} are left out.
+     *
+     * @param semanticNodes the nodes to convert
+     * @return a new document containing one child per converted node
+     */
+    public Document parse(Stream<SemanticNode> semanticNodes) {
+
+        Document markdownRoot = new Document();
+        semanticNodes.forEach(semanticNode -> {
+            Node markdownNode = parseNode(semanticNode);
+            if (markdownNode != null) {
+                markdownRoot.appendChild(markdownNode);
+            }
+        });
+        return markdownRoot;
+    }
+
+
+    /**
+     * Dispatches on the semantic node's type: headlines, paragraphs and tables are converted, every other type
+     * yields {@code null}.
+     */
+    private Node parseNode(SemanticNode semanticNode) {
+
+        switch (semanticNode.getType()) {
+            case HEADLINE:
+                return parseHeadline((Headline) semanticNode);
+            case PARAGRAPH:
+                return parseParagraph((Paragraph) semanticNode);
+            case TABLE:
+                return parseTable((Table) semanticNode);
+            default:
+                return null;
+        }
+    }
+
+
+    /**
+     * Converts a {@link Table} into a GFM {@link TableBlock} made of a head section followed by a body section.
+     */
+    private TableBlock parseTable(Table table) {
+
+        TableHead tableHead = new TableHead();
+        // Row 0 always goes into the head, whatever its cells are flagged as.
+        tableHead.appendChild(createTableRow(table, 0));
+
+        // Following rows stay in the head only while every one of their cells is a header cell.
+        int rowIndex = 1;
+        while (rowIndex < table.getNumberOfRows()) {
+            boolean headerRow = table.streamRow(rowIndex)
+                    .allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader);
+            if (!headerRow) {
+                break;
+            }
+            tableHead.appendChild(createTableRow(table, rowIndex));
+            rowIndex++;
+        }
+
+        // Everything after the header rows forms the body.
+        TableBody tableBody = new TableBody();
+        while (rowIndex < table.getNumberOfRows()) {
+            tableBody.appendChild(createTableRow(table, rowIndex));
+            rowIndex++;
+        }
+
+        TableBlock tableBlock = new TableBlock();
+        tableBlock.appendChild(tableHead);
+        tableBlock.appendChild(tableBody);
+        return tableBlock;
+    }
+
+
+    /**
+     * Creates a markdown table row holding one converted cell per cell of the given row of {@code table}.
+     */
+    private TableRow createTableRow(Table table, int row) {
+
+        TableRow markdownRow = new TableRow();
+        table.streamRow(row)
+                .forEach(cell -> markdownRow.appendChild(createTableCell(cell)));
+        return markdownRow;
+    }
+
+
+    /**
+     * Creates a markdown table cell whose children are the styled inline nodes of the cell's text block.
+     */
+    private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
+
+        TableCell markdownCell = new TableCell();
+        for (Node inlineNode : parseTextBlock(tc.getTextBlock())) {
+            markdownCell.appendChild(inlineNode);
+        }
+        return markdownCell;
+    }
+
+
+    /**
+     * Converts a semantic paragraph into a commonmark paragraph containing the styled inline nodes of its text
+     * block, with line breaks preserved.
+     */
+    private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
+
+        org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
+        for (Node inlineNode : parseTextBlockWithLineBreaks(paragraph.getTextBlock())) {
+            markdownParagraph.appendChild(inlineNode);
+        }
+        return markdownParagraph;
+    }
+
+
+    /**
+     * Converts a semantic headline into a commonmark {@link Heading} containing the styled inline nodes of its
+     * text block.
+     *
+     * The heading level is derived from the size of the headline's tree id and clamped to 1..6, the only levels
+     * CommonMark defines; without the clamp a deeper (or zero-depth) headline would not render as a heading.
+     */
+    private Heading parseHeadline(Headline headline) {
+
+        Heading heading = new Heading();
+        int treeDepth = headline.getTreeId().size();
+        heading.setLevel(Math.max(1, Math.min(6, treeDepth)));
+        parseTextBlockWithLineBreaks(headline.getTextBlock()).forEach(heading::appendChild);
+        return heading;
+
+    }
+
+    /**
+     * Converts a text block into styled inline nodes, emitting a {@link HardLineBreak} after every line of every
+     * style range and dropping the trailing break at the end.
+     *
+     * Each line is trimmed; a line that is empty after trimming contributes only a line break.
+     *
+     * @return the inline nodes in text order; empty when the text block yields no style ranges or no lines
+     */
+    private List<Node> parseTextBlockWithLineBreaks(TextBlock textBlock) {
+
+        List<Node> result = new ArrayList<>();
+        for (TextRangeWithTextType textRange : mergeTextStyles(textBlock)) {
+            String text = textBlock.subSequenceWithLineBreaks(textRange.textRange());
+            // "\n".split("\n") yields no elements, so a range that is exactly one newline needs its break added here.
+            if (text.equals("\n")) {
+                result.add(new HardLineBreak());
+            }
+            for (String line : text.split("\n")) {
+                String cleanedLine = line.trim();
+                if (!cleanedLine.isEmpty()) {
+                    result.add(createStyledNode(textRange.fontStyle(), cleanedLine));
+                }
+                result.add(new HardLineBreak());
+            }
+        }
+        // Every non-empty result ends with a HardLineBreak; drop it. The guard avoids an exception when no
+        // style range produced any node at all.
+        if (!result.isEmpty()) {
+            result.remove(result.size() - 1);
+        }
+        return result;
+    }
+
+
+    /**
+     * Converts a text block into styled inline nodes, one node per style range, without inserting line breaks.
+     */
+    private List<Node> parseTextBlock(TextBlock textBlock) {
+
+        List<Node> result = new ArrayList<>();
+        for (TextRangeWithTextType textRange : mergeTextStyles(textBlock)) {
+            String text = textBlock.subSequence(textRange.textRange()).toString();
+            result.add(createStyledNode(textRange.fontStyle(), text));
+        }
+        return result;
+    }
+
+
+    /**
+     * Wraps {@code text} in the commonmark inline nodes matching {@code fontStyle}: plain text, strong emphasis,
+     * emphasis, or strong emphasis nested inside emphasis.
+     */
+    private Node createStyledNode(FontStyle fontStyle, String text) {
+
+        return switch (fontStyle) {
+            case REGULAR -> new Text(text);
+            case BOLD -> {
+                StrongEmphasis boldBlock = new StrongEmphasis();
+                boldBlock.appendChild(new Text(text));
+                yield boldBlock;
+            }
+            case ITALIC -> {
+                Emphasis italicBlock = new Emphasis("_");
+                italicBlock.appendChild(new Text(text));
+                yield italicBlock;
+            }
+            case BOLD_ITALIC -> {
+                StrongEmphasis boldBlock = new StrongEmphasis();
+                boldBlock.appendChild(new Text(text));
+
+                Emphasis italicBlock = new Emphasis("_");
+                italicBlock.appendChild(boldBlock);
+                yield italicBlock;
+            }
+        };
+    }
+
+
+    /**
+     * Splits the text block's range into consecutive sub-ranges of uniform font style, based on the block's bold
+     * and italic boundaries (each shifted by the block's start offset).
+     *
+     * Without any boundaries the whole range is returned as a single REGULAR range. Otherwise only sub-ranges
+     * with a length greater than 1 are returned.
+     */
+    private List<TextRangeWithTextType> mergeTextStyles(TextBlock textBlock) {
+
+        int blockStart = textBlock.getTextRange().start();
+        int blockEnd = textBlock.getTextRange().end();
+
+        // Offset -> style transitions at that offset, sorted ascending by offset.
+        TreeMap<Integer, Set<FontStyleChange>> changesByOffset = new TreeMap<>();
+        for (TextRange bold : textBlock.getBoldTextBoundaries()) {
+            changesByOffset.computeIfAbsent(bold.start() + blockStart, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
+            changesByOffset.computeIfAbsent(bold.end() + blockStart, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
+        }
+        for (TextRange italic : textBlock.getItalicTextBoundaries()) {
+            changesByOffset.computeIfAbsent(italic.start() + blockStart, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
+            changesByOffset.computeIfAbsent(italic.end() + blockStart, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
+        }
+
+        if (changesByOffset.isEmpty()) {
+            List<TextRangeWithTextType> wholeBlock = new ArrayList<>();
+            wholeBlock.add(new TextRangeWithTextType(new TextRange(blockStart, blockEnd), FontStyle.REGULAR));
+            return wholeBlock;
+        }
+
+        List<TextRangeWithTextType> segments = new ArrayList<>();
+        Set<FontStyle> activeStyles = new HashSet<>();
+        activeStyles.add(FontStyle.REGULAR);
+        int segmentStart = blockStart;
+
+        for (Map.Entry<Integer, Set<FontStyleChange>> entry : changesByOffset.entrySet()) {
+            int offset = entry.getKey();
+
+            // Close the segment that ends at this offset using the styles active before the transitions.
+            if (offset > segmentStart) {
+                segments.add(new TextRangeWithTextType(new TextRange(segmentStart, offset), determineFontStyle(activeStyles)));
+            }
+
+            // Apply all leaves before any enters, so a style that ends and restarts at the same offset stays active.
+            for (FontStyleChange change : entry.getValue()) {
+                if (change.leave()) {
+                    activeStyles.remove(change.style());
+                }
+            }
+            for (FontStyleChange change : entry.getValue()) {
+                if (change.enter()) {
+                    activeStyles.add(change.style());
+                }
+            }
+            if (activeStyles.isEmpty()) {
+                activeStyles.add(FontStyle.REGULAR);
+            }
+
+            segmentStart = offset;
+        }
+
+        // Remainder after the last transition.
+        if (segmentStart < blockEnd) {
+            segments.add(new TextRangeWithTextType(new TextRange(segmentStart, textBlock.getTextRange().end()), determineFontStyle(activeStyles)));
+        }
+
+        // NOTE(review): "> 1" also discards one-character ranges, not only empty ones — confirm this is intended.
+        return segments.stream()
+                .filter(segment -> segment.textRange().length() > 1)
+                .toList();
+    }
+
+
+    /**
+     * Collapses a set of active styles into one {@link FontStyle}: BOLD_ITALIC when both bold and italic are
+     * present, otherwise whichever of the two is present, otherwise REGULAR.
+     */
+    private FontStyle determineFontStyle(Set<FontStyle> styles) {
+
+        boolean bold = styles.contains(FontStyle.BOLD);
+        boolean italic = styles.contains(FontStyle.ITALIC);
+
+        if (bold) {
+            return italic ? FontStyle.BOLD_ITALIC : FontStyle.BOLD;
+        }
+        return italic ? FontStyle.ITALIC : FontStyle.REGULAR;
+    }
+
+
+    /**
+     * Font styles a text range can be rendered with; BOLD_ITALIC is the combination produced when both bold and
+     * italic apply to the same range.
+     */
+    enum FontStyle {
+        REGULAR,
+        BOLD,
+        ITALIC,
+        BOLD_ITALIC;
+    }
+
+    /**
+     * A single style transition: {@code enter == true} means {@code style} starts, {@code false} means it ends.
+     */
+    record FontStyleChange(boolean enter, FontStyle style) {
+
+        /** Creates a transition that starts the given style. */
+        public static FontStyleChange enter(FontStyle style) {
+
+            return new FontStyleChange(true, style);
+        }
+
+
+        /** Creates a transition that ends the given style. */
+        public static FontStyleChange leave(FontStyle style) {
+
+            return new FontStyleChange(false, style);
+        }
+
+
+        /** Returns {@code true} when this transition ends its style, i.e. the negation of {@code enter}. */
+        public boolean leave() {
+
+            return !enter;
+        }
+
+    }
+
+    /** A text range together with the single font style that applies to all of it. */
+    record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
+
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ExtraTokens.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ExtraTokens.java
@ -0,0 +1,13 @@
+package com.knecon.fforesight.service.layoutparser.processor.markdown;
+
+import org.commonmark.node.IndentedCodeBlock;
+import org.commonmark.node.Paragraph;
+
+import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;
+
+public class ExtraTokens {
+
+    public static int INDENTED_CODE_BLOCK = 10;
+    public static int PARAGRAPH = 10;
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownChunker.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownChunker.java
@ -0,0 +1,527 @@
+package com.knecon.fforesight.service.layoutparser.processor.markdown;
+
+import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
+import static com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter.countTokens;
+
+import java.text.BreakIterator;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.stream.Collectors;
+
+import org.commonmark.ext.gfm.tables.TableBlock;
+import org.commonmark.ext.gfm.tables.TableBody;
+import org.commonmark.node.AbstractVisitor;
+import org.commonmark.node.BlockQuote;
+import org.commonmark.node.BulletList;
+import org.commonmark.node.Code;
+import org.commonmark.node.CustomBlock;
+import org.commonmark.node.CustomNode;
+import org.commonmark.node.Document;
+import org.commonmark.node.Emphasis;
+import org.commonmark.node.FencedCodeBlock;
+import org.commonmark.node.HardLineBreak;
+import org.commonmark.node.Heading;
+import org.commonmark.node.HtmlBlock;
+import org.commonmark.node.HtmlInline;
+import org.commonmark.node.Image;
+import org.commonmark.node.IndentedCodeBlock;
+import org.commonmark.node.Link;
+import org.commonmark.node.LinkReferenceDefinition;
+import org.commonmark.node.ListBlock;
+import org.commonmark.node.ListItem;
+import org.commonmark.node.Node;
+import org.commonmark.node.OrderedList;
+import org.commonmark.node.Paragraph;
+import org.commonmark.node.SoftLineBreak;
+import org.commonmark.node.StrongEmphasis;
+import org.commonmark.node.Text;
+import org.commonmark.node.ThematicBreak;
+import org.commonmark.renderer.Renderer;
+
+import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;
+
+import lombok.AccessLevel;
+import lombok.experimental.FieldDefaults;
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+@FieldDefaults(level = AccessLevel.PRIVATE)
+public class MarkdownChunker extends AbstractVisitor {
+
+    NodeCopier nodeCopier = NodeCopier.INSTANCE;
+
+    final int tokenLimit;
+    List<Document> allChunks;
+
+    Deque<Heading> currentHeadings;
+    Document chunk;
+    boolean validChunk;
+
+
+    /**
+     * Creates a chunker with empty heading stack and result list and opens the first chunk.
+     *
+     * @param tokenLimit the token count a chunk is compared against
+     */
+    public MarkdownChunker(int tokenLimit) {
+
+        this.tokenLimit = tokenLimit;
+        this.currentHeadings = new LinkedList<>();
+        this.allChunks = new LinkedList<>();
+        startNewChunk();
+    }
+
+
+    /**
+     * Returns all chunks collected so far after verifying that none of them exceeds the token limit.
+     *
+     * @return the list of chunks
+     * @throws IllegalArgumentException for the first chunk whose token count is above the limit
+     */
+    public List<Document> getResult() {
+
+        allChunks.stream()
+                .filter(candidate -> countTokens(candidate) > tokenLimit)
+                .findFirst()
+                .ifPresent(this::throwUnsplittableNodeError);
+        return allChunks;
+    }
+
+
+    /**
+     * Handles a heading. Headings deeper than level 4 are added to the current chunk like any other
+     * content. All other headings replace the stack entries of the same or a deeper level and open a
+     * new chunk, which starts with copies of the headings on the stack.
+     *
+     * @param heading the visited heading
+     */
+    @Override
+    public void visit(Heading heading) {
+
+        if (heading.getLevel() > 4) {
+            // deep headings are plain content; without returning here the same heading would end up
+            // at the end of the current chunk and again at the top of the next one
+            addToChunk(heading);
+            return;
+        }
+
+        // remove headings of the same or a deeper level so the stack only holds the ancestors
+        while (!currentHeadings.isEmpty() && currentHeadings.peek().getLevel() >= heading.getLevel()) {
+            currentHeadings.pop();
+        }
+        currentHeadings.push(heading);
+
+        startNewChunk();
+    }
+
+
+    /**
+     * Opens a new chunk and registers it in the result list. If the chunk that was current so far
+     * has not been marked valid, it is removed from the result list first.
+     */
+    private void startNewChunk() {
+
+        boolean discardCurrent = !allChunks.isEmpty() && !validChunk;
+        if (discardCurrent) {
+            int lastIndex = allChunks.size() - 1;
+            allChunks.remove(lastIndex);
+        }
+
+        chunk = buildNewChunk();
+        allChunks.add(chunk);
+        validChunk = false;
+    }
+
+
+    /**
+     * Builds a document that contains deep copies of all headings on the heading stack, appended in
+     * reverse stack order (the heading pushed first comes first).
+     *
+     * @return the new document
+     */
+    private Document buildNewChunk() {
+
+        List<Node> oldestFirst = new LinkedList<>();
+        for (Heading stacked : currentHeadings) {
+            // the stack iterates newest first, so prepending restores the push order
+            oldestFirst.add(0, nodeCopier.copyNodeWithChildren(stacked));
+        }
+
+        Document freshChunk = new Document();
+        for (Node headingCopy : oldestFirst) {
+            freshChunk.appendChild(headingCopy);
+        }
+        return freshChunk;
+    }
+
+
+    /**
+     * @return the token count of the chunk that is currently being filled
+     */
+    public int currentTokenCount() {
+
+        return countTokens(chunk);
+    }
+
+
+    /**
+     * Checks whether a node would stay within the token limit when placed into a fresh chunk, i.e. a
+     * document holding only the heading copies. The check works on a copy, the node itself is not moved.
+     *
+     * @param node the node to check
+     * @return {@code true} if the fresh chunk plus the node does not exceed the token limit
+     */
+    public boolean fitsTokenLimit(Node node) {
+
+        Document probe = buildNewChunk();
+        Node nodeCopy = nodeCopier.copyNodeWithChildren(node);
+        probe.appendChild(nodeCopy);
+        return tokenLimit >= TokenCounter.countTokens(probe);
+    }
+
+
+    /**
+     * Appends a node to the current chunk. If the chunk would exceed the token limit, the node is
+     * moved to a new chunk instead; if it does not even fit there, it is split into smaller nodes.
+     * A chunk is marked valid as soon as a node has been placed in it, otherwise
+     * {@code startNewChunk} would discard it together with its content.
+     *
+     * @param node the node to add, it is moved out of its current parent
+     */
+    private void addToChunk(Node node) {
+
+        chunk.appendChild(node);
+        if (currentTokenCount() <= tokenLimit) {
+            validChunk = true;
+            return;
+        }
+
+        // does not fit next to the existing content, retry in a fresh chunk
+        node.unlink();
+        startNewChunk();
+        chunk.appendChild(node);
+        if (currentTokenCount() <= tokenLimit) {
+            validChunk = true;
+            return;
+        }
+
+        // node is too large and won't fit in tokenLimit even on its own, split is necessary
+        node.unlink();
+        startNewChunk();
+        splitNodeAndAddToChunk(node);
+    }
+
+
+    /**
+     * Dispatches an oversized node to the split routine for its type. Tables, bullet lists, ordered
+     * lists, paragraphs and indented code blocks can be split, every other node type is rejected.
+     *
+     * @param node the node to split
+     * @throws IllegalArgumentException if there is no split routine for the node type
+     */
+    private void splitNodeAndAddToChunk(Node node) {
+
+        if (node instanceof TableBlock table) {
+            splitTable(table);
+        } else if (node instanceof BulletList bullets) {
+            splitList(bullets);
+        } else if (node instanceof OrderedList ordered) {
+            splitList(ordered);
+        } else if (node instanceof Paragraph text) {
+            splitParagraph(text);
+        } else if (node instanceof IndentedCodeBlock code) {
+            splitCodeBlock(code);
+        } else {
+            throwUnsplittableNodeError(node);
+        }
+    }
+
+
+    /**
+     * Splits the literal of an indented code block at the break positions reported by a line
+     * {@link BreakIterator} and adds the resulting blocks to the chunks. Segments are accumulated as
+     * long as the accumulated block fits the token limit; no text is dropped.
+     *
+     * @param indentedCodeBlock the oversized code block
+     * @throws IllegalArgumentException if a remainder that cannot be divided any further still does not fit
+     */
+    private void splitCodeBlock(IndentedCodeBlock indentedCodeBlock) {
+
+        String literal = indentedCodeBlock.getLiteral();
+        List<IndentedCodeBlock> splitBlocks = new LinkedList<>();
+        StringBuilder sb = new StringBuilder();
+        BreakIterator lineIterator = BreakIterator.getLineInstance(Locale.ENGLISH);
+        lineIterator.setText(literal);
+        int start = lineIterator.first();
+        for (int end = lineIterator.next(); end != BreakIterator.DONE; start = end, end = lineIterator.next()) {
+            String segment = literal.substring(start, end);
+            // check BEFORE appending: if the next segment would overflow, flush what has been collected
+            // so far and start the next block with this segment
+            if (!sb.isEmpty() && !fitsTokenLimit(buildIndentedCodeBlock(sb.toString() + segment))) {
+                splitBlocks.add(buildIndentedCodeBlock(sb.toString()));
+                sb = new StringBuilder();
+            }
+            sb.append(segment);
+        }
+        if (!sb.isEmpty()) {
+            if (fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) {
+                splitBlocks.add(buildIndentedCodeBlock(sb.toString()));
+            } else {
+                if (sb.length() < 2) {
+                    // cannot be halved any more, halving again would recurse without end
+                    throwUnsplittableNodeError(buildIndentedCodeBlock(sb.toString()));
+                }
+                // a single segment is still too large: halve it, addToChunk splits further if necessary
+                int mid = sb.length() / 2;
+                splitBlocks.add(buildIndentedCodeBlock(sb.substring(0, mid)));
+                splitBlocks.add(buildIndentedCodeBlock(sb.substring(mid, sb.length())));
+            }
+        }
+
+        splitBlocks.forEach(this::addToChunk);
+    }
+
+
+    /**
+     * @param string the literal of the new block
+     * @return a new indented code block carrying the given literal
+     */
+    private static IndentedCodeBlock buildIndentedCodeBlock(String string) {
+
+        IndentedCodeBlock codeBlock = new IndentedCodeBlock();
+        codeBlock.setLiteral(string);
+        return codeBlock;
+    }
+
+
+    /**
+     * Adds a paragraph to the chunks, splitting it recursively while it does not fit the token limit.
+     * A paragraph with several children is halved by its children; a paragraph whose only child is a
+     * text node is split inside that text.
+     *
+     * @param paragraph the paragraph to add, its children are moved into the new paragraphs
+     * @throws IllegalArgumentException if the paragraph is empty or consists of a single non-text child
+     *                                  and still exceeds the token limit
+     */
+    private void splitParagraph(Paragraph paragraph) {
+
+        if (fitsTokenLimit(paragraph)) {
+            addToChunk(paragraph);
+            return;
+        }
+
+        List<Node> children = collectAllChildNodes(paragraph);
+
+        if (children.isEmpty()) {
+            // nothing left to divide, halving an empty paragraph again would recurse without end
+            throwUnsplittableNodeError(paragraph);
+        }
+
+        if (children.size() == 1) {
+            if (children.get(0) instanceof Text text) {
+                List<Text> splitTexts = splitText(text);
+                for (Text splitText : splitTexts) {
+                    Paragraph paragraph1 = new Paragraph();
+                    paragraph1.appendChild(splitText);
+                    addToChunk(paragraph1);
+                }
+                return;
+            }
+            throwUnsplittableNodeError(children.get(0));
+        }
+
+        Paragraph paragraph1 = new Paragraph();
+        Paragraph paragraph2 = new Paragraph();
+
+        int mid = children.size() / 2;
+        children.subList(0, mid)
+                .forEach(paragraph1::appendChild);
+        children.subList(mid, children.size())
+                .forEach(paragraph2::appendChild);
+
+        splitParagraph(paragraph1);
+        splitParagraph(paragraph2);
+    }
+
+
+    /**
+     * Logs the rendered node and always throws, reporting the node, its token count and the limit.
+     *
+     * @param node the node that exceeds the token limit
+     * @throws IllegalArgumentException always
+     */
+    private void throwUnsplittableNodeError(Node node) {
+
+        String renderedNode = buildRenderer().render(node);
+        log.error(renderedNode);
+        String message = "Node %s exceeds token limit (%d/%d) and can't be split!".formatted(node, countTokens(renderedNode), tokenLimit);
+        throw new IllegalArgumentException(message);
+    }
+
+
+    /**
+     * Collects the direct children of a node in document order without detaching them.
+     *
+     * @param parent the node whose children are collected
+     * @return a new list with all direct children
+     */
+    private static List<Node> collectAllChildNodes(Node parent) {
+
+        List<Node> collected = new LinkedList<>();
+        Node current = parent.getFirstChild();
+        while (current != null) {
+            Node successor = current.getNext();
+            collected.add(current);
+            current = successor;
+        }
+        return collected;
+    }
+
+
+    /**
+     * Splits the literal of a text node at the boundaries reported by a sentence {@link BreakIterator}.
+     * Sentences are accumulated as long as a paragraph holding the accumulated text fits the token
+     * limit; no text is dropped.
+     *
+     * @param text the text node to split
+     * @return the text pieces in original order
+     * @throws IllegalArgumentException if a remainder that cannot be divided any further still does not fit
+     */
+    private List<Text> splitText(Text text) {
+
+        String literal = text.getLiteral();
+        List<Text> splitTexts = new LinkedList<>();
+        StringBuilder sb = new StringBuilder();
+        BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH);
+        sentenceIterator.setText(literal);
+        int start = sentenceIterator.first();
+        for (int end = sentenceIterator.next(); end != BreakIterator.DONE; start = end, end = sentenceIterator.next()) {
+            String sentence = literal.substring(start, end);
+            // check BEFORE appending: if the next sentence would overflow, flush what has been collected
+            // so far and start the next piece with this sentence
+            if (!sb.isEmpty() && !fitsTokenLimit(buildParagraphWithText(new StringBuilder(sb).append(sentence)))) {
+                splitTexts.add(new Text(sb.toString()));
+                sb = new StringBuilder();
+            }
+            sb.append(sentence);
+        }
+        if (!sb.isEmpty()) {
+            if (fitsTokenLimit(buildParagraphWithText(sb))) {
+                splitTexts.add(new Text(sb.toString()));
+            } else {
+                if (sb.length() < 2) {
+                    // cannot be halved any more, halving again would recurse without end
+                    throwUnsplittableNodeError(buildParagraphWithText(sb));
+                }
+                // a single sentence is still too large: halve it, the caller splits further if necessary
+                int mid = sb.length() / 2;
+                splitTexts.add(new Text(sb.substring(0, mid)));
+                splitTexts.add(new Text(sb.substring(mid, sb.length())));
+            }
+        }
+        return splitTexts;
+    }
+
+
+    /**
+     * @param sb the text content
+     * @return a new paragraph whose only child is a text node with the builder's current content
+     */
+    private static Paragraph buildParagraphWithText(StringBuilder sb) {
+
+        Text content = new Text(sb.toString());
+        Paragraph wrapper = new Paragraph();
+        wrapper.appendChild(content);
+        return wrapper;
+    }
+
+
+    /**
+     * Adds a bullet list to the chunks if it fits the token limit, otherwise distributes its items
+     * over two new bullet lists.
+     *
+     * @param bulletList the list to add or split
+     */
+    private void splitList(BulletList bulletList) {
+
+        if (!fitsTokenLimit(bulletList)) {
+            BulletList firstHalf = new BulletList();
+            BulletList secondHalf = new BulletList();
+            splitLists(bulletList, firstHalf, secondHalf);
+            return;
+        }
+
+        addToChunk(bulletList);
+    }
+
+
+    /**
+     * Adds an ordered list to the chunks if it fits the token limit, otherwise distributes its items
+     * over two new ordered lists.
+     *
+     * @param orderedList the list to add or split
+     */
+    private void splitList(OrderedList orderedList) {
+
+        if (!fitsTokenLimit(orderedList)) {
+            OrderedList firstHalf = new OrderedList();
+            OrderedList secondHalf = new OrderedList();
+            splitLists(orderedList, firstHalf, secondHalf);
+            return;
+        }
+
+        addToChunk(orderedList);
+    }
+
+
+    /**
+     * Distributes the items of an oversized list over the two given lists and processes both halves.
+     * A list with a single item is dissolved instead: the children of that item are added to the
+     * chunks directly.
+     *
+     * @param originList the oversized list, its items are moved out
+     * @param list1      empty list receiving the first half of the items
+     * @param list2      empty list receiving the second half of the items
+     * @throws IllegalArgumentException if the list has no items and still exceeds the token limit
+     */
+    private void splitLists(ListBlock originList, ListBlock list1, ListBlock list2) {
+
+        List<Node> listItems = collectAllChildNodes(originList);
+
+        if (listItems.isEmpty()) {
+            // nothing left to divide, halving an empty list again would recurse without end
+            throwUnsplittableNodeError(originList);
+        }
+
+        if (listItems.size() == 1) {
+            collectAllChildNodes(listItems.get(0)).forEach(this::addToChunk);
+            // the item has been dissolved; continuing would emit an empty list and an emptied list item
+            return;
+        }
+
+        int mid = listItems.size() / 2;
+        listItems.subList(0, mid)
+                .forEach(list1::appendChild);
+        listItems.subList(mid, listItems.size())
+                .forEach(list2::appendChild);
+
+        splitNodeAndAddToChunk(list1);
+        splitNodeAndAddToChunk(list2);
+    }
+
+
+    /**
+     * Adds a table to the chunks, halving it by rows recursively while it does not fit the token
+     * limit. Both halves receive a deep copy of the first child of the table (its head) and a body
+     * holding their share of the rows.
+     *
+     * @param tableBlock the table to add or split, its rows are moved into the new tables
+     * @throws IllegalArgumentException if the table has no body rows or only a single row and still
+     *                                  exceeds the token limit
+     */
+    private void splitTable(TableBlock tableBlock) {
+
+        if (fitsTokenLimit(tableBlock)) {
+            addToChunk(tableBlock);
+            return;
+        }
+
+        Node tableHead = tableBlock.getFirstChild();
+        Node tableBody = tableBlock.getLastChild();
+        // a table without a body has its head as last child, which must not be treated as rows
+        List<Node> tableRows = tableBody instanceof TableBody ? collectAllChildNodes(tableBody) : List.of();
+
+        if (tableRows.isEmpty()) {
+            throw new IllegalArgumentException("The table headers already exceeds the token limit");
+        }
+        if (tableRows.size() == 1) {
+            throw new IllegalArgumentException("A single table row already exceeds the token limit");
+        }
+
+        TableBlock tableBlock1 = new TableBlock();
+        TableBlock tableBlock2 = new TableBlock();
+
+        // deep copy, a shallow copy of the head would lose the header row and its cells
+        tableBlock1.appendChild(nodeCopier.copyNodeWithChildren(tableHead));
+        tableBlock2.appendChild(nodeCopier.copyNodeWithChildren(tableHead));
+
+        TableBody tableBody1 = new TableBody();
+        TableBody tableBody2 = new TableBody();
+
+        int mid = tableRows.size() / 2;
+        tableRows.subList(0, mid)
+                .forEach(tableBody1::appendChild);
+        tableRows.subList(mid, tableRows.size())
+                .forEach(tableBody2::appendChild);
+
+        // attach the bodies, otherwise the rows never become part of the new tables
+        tableBlock1.appendChild(tableBody1);
+        tableBlock2.appendChild(tableBody2);
+
+        splitTable(tableBlock1);
+        splitTable(tableBlock2);
+    }
+
+
+    // The visit overloads below all hand the visited node to addToChunk as a whole.
+    // None of them descends into the children of the node.
+
+    @Override
+    public void visit(BlockQuote blockQuote) {
+
+        this.addToChunk(blockQuote);
+    }
+
+
+    @Override
+    public void visit(BulletList bulletList) {
+
+        this.addToChunk(bulletList);
+    }
+
+
+    @Override
+    public void visit(Code code) {
+
+        this.addToChunk(code);
+    }
+
+
+    @Override
+    public void visit(Emphasis emphasis) {
+
+        this.addToChunk(emphasis);
+    }
+
+
+    @Override
+    public void visit(FencedCodeBlock fencedCodeBlock) {
+
+        this.addToChunk(fencedCodeBlock);
+    }
+
+
+    @Override
+    public void visit(HardLineBreak hardLineBreak) {
+
+        this.addToChunk(hardLineBreak);
+    }
+
+
+    @Override
+    public void visit(ThematicBreak thematicBreak) {
+
+        this.addToChunk(thematicBreak);
+    }
+
+
+    @Override
+    public void visit(HtmlInline htmlInline) {
+
+        this.addToChunk(htmlInline);
+    }
+
+
+    @Override
+    public void visit(HtmlBlock htmlBlock) {
+
+        this.addToChunk(htmlBlock);
+    }
+
+
+    @Override
+    public void visit(Image image) {
+
+        this.addToChunk(image);
+    }
+
+
+    @Override
+    public void visit(IndentedCodeBlock indentedCodeBlock) {
+
+        this.addToChunk(indentedCodeBlock);
+    }
+
+
+    @Override
+    public void visit(Link link) {
+
+        this.addToChunk(link);
+    }
+
+
+    @Override
+    public void visit(ListItem listItem) {
+
+        this.addToChunk(listItem);
+    }
+
+
+    @Override
+    public void visit(OrderedList orderedList) {
+
+        this.addToChunk(orderedList);
+    }
+
+
+    @Override
+    public void visit(Paragraph paragraph) {
+
+        this.addToChunk(paragraph);
+    }
+
+
+    @Override
+    public void visit(SoftLineBreak softLineBreak) {
+
+        this.addToChunk(softLineBreak);
+    }
+
+
+    @Override
+    public void visit(StrongEmphasis strongEmphasis) {
+
+        this.addToChunk(strongEmphasis);
+    }
+
+
+    @Override
+    public void visit(Text text) {
+
+        this.addToChunk(text);
+    }
+
+
+    @Override
+    public void visit(LinkReferenceDefinition linkReferenceDefinition) {
+
+        this.addToChunk(linkReferenceDefinition);
+    }
+
+
+    @Override
+    public void visit(CustomBlock customBlock) {
+
+        this.addToChunk(customBlock);
+    }
+
+
+    @Override
+    public void visit(CustomNode customNode) {
+
+        this.addToChunk(customNode);
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/NodeCopier.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/NodeCopier.java
@ -0,0 +1,171 @@
+package com.knecon.fforesight.service.layoutparser.processor.markdown;
+
+import org.commonmark.ext.gfm.tables.TableBlock;
+import org.commonmark.ext.gfm.tables.TableBody;
+import org.commonmark.ext.gfm.tables.TableCell;
+import org.commonmark.ext.gfm.tables.TableHead;
+import org.commonmark.ext.gfm.tables.TableRow;
+import org.commonmark.node.BlockQuote;
+import org.commonmark.node.BulletList;
+import org.commonmark.node.Code;
+import org.commonmark.node.Document;
+import org.commonmark.node.Emphasis;
+import org.commonmark.node.FencedCodeBlock;
+import org.commonmark.node.HardLineBreak;
+import org.commonmark.node.Heading;
+import org.commonmark.node.HtmlBlock;
+import org.commonmark.node.HtmlInline;
+import org.commonmark.node.Image;
+import org.commonmark.node.IndentedCodeBlock;
+import org.commonmark.node.Link;
+import org.commonmark.node.LinkReferenceDefinition;
+import org.commonmark.node.ListItem;
+import org.commonmark.node.Node;
+import org.commonmark.node.OrderedList;
+import org.commonmark.node.Paragraph;
+import org.commonmark.node.SoftLineBreak;
+import org.commonmark.node.StrongEmphasis;
+import org.commonmark.node.Text;
+import org.commonmark.node.ThematicBreak;
+import org.mapstruct.Mapper;
+import org.mapstruct.Mapping;
+import org.mapstruct.factory.Mappers;
+
+@Mapper
+public interface NodeCopier {
+
+    NodeCopier INSTANCE = Mappers.getMapper(NodeCopier.class);
+
+
+    /**
+     * Copies a node and, recursively, all of its descendants. The original tree is left untouched.
+     *
+     * @param node the root of the subtree to copy
+     * @return the root of the copied subtree
+     * @throws IllegalArgumentException if the subtree contains a node type without copy method
+     */
+    default Node copyNodeWithChildren(Node node) {
+
+        Node duplicate = copy(node);
+
+        Node child = node.getFirstChild();
+        while (child != null) {
+            Node sibling = child.getNext();
+            duplicate.appendChild(copyNodeWithChildren(child));
+            child = sibling;
+        }
+        return duplicate;
+    }
+
+
+    /**
+     * Copies a single node without its children by dispatching on the simple class name of the node
+     * to the typed copy method.
+     *
+     * @param node the node to copy
+     * @return the copy created by the typed copy method
+     * @throws IllegalArgumentException if no copy method exists for the class of the node
+     */
+    default Node copy(Node node) {
+
+        String simpleName = node.getClass().getSimpleName();
+        return switch (simpleName) {
+            case "BlockQuote" -> copy((BlockQuote) node);
+            case "BulletList" -> copy((BulletList) node);
+            case "Code" -> copy((Code) node);
+            case "Document" -> copy((Document) node);
+            case "Emphasis" -> copy((Emphasis) node);
+            case "FencedCodeBlock" -> copy((FencedCodeBlock) node);
+            case "HardLineBreak" -> copy((HardLineBreak) node);
+            case "Heading" -> copy((Heading) node);
+            case "HtmlBlock" -> copy((HtmlBlock) node);
+            case "HtmlInline" -> copy((HtmlInline) node);
+            case "Image" -> copy((Image) node);
+            case "IndentedCodeBlock" -> copy((IndentedCodeBlock) node);
+            case "Link" -> copy((Link) node);
+            case "LinkReferenceDefinition" -> copy((LinkReferenceDefinition) node);
+            case "ListItem" -> copy((ListItem) node);
+            case "OrderedList" -> copy((OrderedList) node);
+            case "Paragraph" -> copy((Paragraph) node);
+            case "SoftLineBreak" -> copy((SoftLineBreak) node);
+            case "StrongEmphasis" -> copy((StrongEmphasis) node);
+            case "Text" -> copy((Text) node);
+            case "ThematicBreak" -> copy((ThematicBreak) node);
+            case "TableBlock" -> copy((TableBlock) node);
+            case "TableBody" -> copy((TableBody) node);
+            case "TableCell" -> copy((TableCell) node);
+            case "TableHead" -> copy((TableHead) node);
+            case "TableRow" -> copy((TableRow) node);
+            default -> throw new IllegalArgumentException("No copy method found for class: " + node.getClass().getName());
+        };
+    }
+
+
+    // Typed copy methods, one per node class handled by copy(Node). They are abstract; as this
+    // interface is annotated with @Mapper, the implementations are expected to be generated by MapStruct.
+
+    BlockQuote copy(BlockQuote blockQuote);
+
+
+    // the bulletMarker target property is excluded from the mapping
+    @Mapping(target = "bulletMarker", ignore = true)
+    BulletList copy(BulletList bulletList);
+
+
+    Code copy(Code code);
+
+
+    Document copy(Document document);
+
+
+    // the delimiter target property is filled from the openingDelimiter of the source
+    @Mapping(target = "delimiter", source = "openingDelimiter")
+    Emphasis copy(Emphasis emphasis);
+
+
+    // the fenceChar and fenceLength target properties are excluded from the mapping
+    @Mapping(target = "fenceChar", ignore = true)
+    @Mapping(target = "fenceLength", ignore = true)
+    FencedCodeBlock copy(FencedCodeBlock fencedCodeBlock);
+
+
+    HardLineBreak copy(HardLineBreak hardLineBreak);
+
+
+    Heading copy(Heading heading);
+
+
+    HtmlBlock copy(HtmlBlock htmlBlock);
+
+
+    HtmlInline copy(HtmlInline htmlInline);
+
+
+    Image copy(Image image);
+
+
+    IndentedCodeBlock copy(IndentedCodeBlock indentedCodeBlock);
+
+
+    Link copy(Link link);
+
+
+    LinkReferenceDefinition copy(LinkReferenceDefinition linkReferenceDefinition);
+
+
+    ListItem copy(ListItem listItem);
+
+
+    // the startNumber and delimiter target properties are excluded from the mapping
+    @Mapping(target = "startNumber", ignore = true)
+    @Mapping(target = "delimiter", ignore = true)
+    OrderedList copy(OrderedList orderedList);
+
+
+    Paragraph copy(Paragraph paragraph);
+
+
+    SoftLineBreak copy(SoftLineBreak softLineBreak);
+
+
+    // the delimiter target property is filled from the openingDelimiter of the source
+    @Mapping(target = "delimiter", source = "openingDelimiter")
+    StrongEmphasis copy(StrongEmphasis strongEmphasis);
+
+
+    Text copy(Text text);
+
+
+    ThematicBreak copy(ThematicBreak thematicBreak);
+
+
+    TableBlock copy(TableBlock tableBlock);
+
+
+    TableBody copy(TableBody tableBody);
+
+
+    TableCell copy(TableCell tableCell);
+
+
+    TableHead copy(TableHead tableHead);
+
+
+    TableRow copy(TableRow tableRow);
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ReflectionNodeCopier.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ReflectionNodeCopier.java
@ -0,0 +1,65 @@
+package com.knecon.fforesight.service.layoutparser.processor.markdown;
+
+import java.lang.reflect.Field;
+
+import org.commonmark.node.Node;
+
+/**
+ * Copies commonmark nodes via reflection: a new instance of the node class is created through its
+ * no-argument constructor and all primitive, primitive wrapper and String instance fields are
+ * transferred. Children are copied recursively and appended to the copy.
+ */
+public class ReflectionNodeCopier {
+
+    // not referenced by any method of this class
+    NodeCopier mapperNodeCopier;
+
+
+    /**
+     * Copies a node together with all of its descendants.
+     *
+     * @param node the root of the subtree to copy
+     * @return the root of the copied subtree
+     * @throws IllegalStateException if a node cannot be instantiated or one of its fields cannot be copied
+     */
+    public static Node copyNode(Node node) {
+
+        Node copy = deepCopy(node);
+        copyChildren(node, copy);
+        return copy;
+    }
+
+
+    /**
+     * Appends copies of all children of {@code nodeToCopy} to {@code copy}.
+     */
+    private static void copyChildren(Node nodeToCopy, Node copy) {
+
+        Node next;
+        for (Node node = nodeToCopy.getFirstChild(); node != null; node = next) {
+            next = node.getNext();
+            copy.appendChild(copyNode(node));
+        }
+    }
+
+
+    /**
+     * Creates a new instance of the object's class and copies all scalar instance fields declared in
+     * the class and its superclasses. Fields of any other type (e.g. references to other nodes) are
+     * left at the value assigned by the constructor.
+     *
+     * @throws IllegalStateException if instantiation or field access fails; the cause is preserved
+     */
+    private static <T> T deepCopy(T object) {
+
+        Class<?> clazz = object.getClass();
+        try {
+            // safe: the instance is created from the runtime class of the object
+            @SuppressWarnings("unchecked")
+            T copy = (T) clazz.getDeclaredConstructor().newInstance();
+
+            // walk up the hierarchy, getDeclaredFields() does not return inherited fields
+            for (Class<?> type = clazz; type != null && type != Object.class; type = type.getSuperclass()) {
+                for (Field field : type.getDeclaredFields()) {
+                    if (java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
+                        // static state is shared between instances and static final fields cannot be written
+                        continue;
+                    }
+                    if (isPrimitiveOrWrapper(field.getType()) || field.getType().equals(String.class)) {
+                        field.setAccessible(true);
+                        field.set(copy, field.get(object));
+                    }
+                }
+            }
+            return copy;
+        } catch (ReflectiveOperationException e) {
+            // fail loudly instead of returning null, which only surfaced later as a NullPointerException
+            throw new IllegalStateException("Could not copy node of type " + clazz.getName(), e);
+        }
+    }
+
+
+    /**
+     * @return {@code true} if the type is a primitive or one of the wrapper classes listed below
+     */
+    private static boolean isPrimitiveOrWrapper(Class<?> type) {
+
+        return type.isPrimitive()
+               || type == Boolean.class
+               || type == Byte.class
+               || type == Character.class
+               || type == Double.class
+               || type == Float.class
+               || type == Integer.class
+               || type == Long.class
+               || type == Short.class;
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java
@ -7,6 +7,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;

+import com.google.common.base.Functions;
+
 import lombok.Getter;

@Getter
@ -59,7 +61,9 @@ public class FloatFrequencyCounter {
            }
        }

-        return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
+        return higher.stream()
+                .sorted(Collections.reverseOrder())
+                .collect(Collectors.toList());
    }


@ -74,4 +78,16 @@ public class FloatFrequencyCounter {
        return highest;
    }

+
+    /**
+     * Computes the average of all values, each value weighted by its count.
+     *
+     * @return the sum of value times count divided by the sum of all counts
+     */
+    public double getAverage() {
+
+        double weightedSum = countPerValue.entrySet()
+                .stream()
+                .mapToDouble(entry -> entry.getKey() * entry.getValue())
+                .sum();
+
+        int totalCount = 0;
+        for (int occurrences : countPerValue.values()) {
+            totalCount += occurrences;
+        }
+        double count = totalCount;
+        return weightedSum / count;
+    }
+
 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java
@ -13,13 +13,13 @@ import lombok.Setter;
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
-public class Boundary implements Comparable<Boundary> {
+public class TextRange implements Comparable<TextRange> {

    private int start;
    private int end;


-    public Boundary(int start, int end) {
+    public TextRange(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
@ -47,15 +47,15 @@ public class Boundary implements Comparable<Boundary> {
    }


-    public boolean contains(Boundary boundary) {
+    public boolean contains(TextRange textRange) {

-        return start <= boundary.start() && boundary.end() <= end;
+        return start <= textRange.start() && textRange.end() <= end;
    }


-    public boolean containedBy(Boundary boundary) {
+    public boolean containedBy(TextRange textRange) {

-        return boundary.contains(this);
+        return textRange.contains(this);
    }


@ -83,18 +83,18 @@ public class Boundary implements Comparable<Boundary> {
    }


-    public boolean intersects(Boundary boundary) {
+    public boolean intersects(TextRange textRange) {

-        return boundary.start() < this.end && this.start < boundary.end();
+        return textRange.start() < this.end && this.start < textRange.end();
    }


-    public List<Boundary> split(List<Integer> splitIndices) {
+    public List<TextRange> split(List<Integer> splitIndices) {

        if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
        }
-        List<Boundary> splitBoundaries = new LinkedList<>();
+        List<TextRange> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {

@ -102,10 +102,10 @@ public class Boundary implements Comparable<Boundary> {
            if (splitIndex == previousIndex) {
                continue;
            }
-            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
+            splitBoundaries.add(new TextRange(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
-        splitBoundaries.add(new Boundary(previousIndex, end));
+        splitBoundaries.add(new TextRange(previousIndex, end));
        return splitBoundaries;
    }

@ -114,11 +114,11 @@ public class Boundary implements Comparable<Boundary> {
        return IntStream.range(start, end);
    }

-    public static Boundary merge(Collection<Boundary> boundaries) {
+    public static TextRange merge(Collection<TextRange> boundaries) {

-        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
-        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
-        return new Boundary(minStart, maxEnd);
+        int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
+        int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
+        return new TextRange(minStart, maxEnd);
    }


@ -130,12 +130,12 @@ public class Boundary implements Comparable<Boundary> {


    @Override
-    public int compareTo(Boundary boundary) {
+    public int compareTo(TextRange textRange) {

-        if (end < boundary.end() && start < boundary.start()) {
+        if (end < textRange.end() && start < textRange.start()) {
            return -1;
        }
-        if (start > boundary.start() && end > boundary.end()) {
+        if (start > textRange.start() && end > textRange.end()) {
            return 1;
        }

--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java
@ -11,7 +11,7 @@ import java.util.Map;
 import java.util.Set;

 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
 import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
@ -32,7 +32,7 @@ public class RedactionEntity {

    // initial values
    @EqualsAndHashCode.Include
-    final Boundary boundary;
+    final TextRange textRange;
    @EqualsAndHashCode.Include
    final String type;
    @EqualsAndHashCode.Include
@ -66,9 +66,9 @@ public class RedactionEntity {
    SemanticNode deepestFullyContainingNode;


-    public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
+    public static RedactionEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) {

-        return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
+        return RedactionEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build();
    }


@ -132,7 +132,7 @@ public class RedactionEntity {
    public List<RedactionPosition> getRedactionPositionsPerPage() {

        if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
-            Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
+            Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);

            Page firstPage = rectanglesPerLinePerPage.keySet()
                    .stream()
@ -157,19 +157,19 @@ public class RedactionEntity {

    public boolean containedBy(RedactionEntity redactionEntity) {

-        return this.boundary.containedBy(redactionEntity.getBoundary());
+        return this.textRange.containedBy(redactionEntity.getTextRange());
    }


    public boolean contains(RedactionEntity redactionEntity) {

-        return this.boundary.contains(redactionEntity.getBoundary());
+        return this.textRange.contains(redactionEntity.getTextRange());
    }


    public boolean intersects(RedactionEntity redactionEntity) {

-        return this.boundary.intersects(redactionEntity.getBoundary());
+        return this.textRange.intersects(redactionEntity.getTextRange());
    }


@ -210,7 +210,7 @@ public class RedactionEntity {
        sb.append("Entity[\"");
        sb.append(value);
        sb.append("\", ");
-        sb.append(boundary);
+        sb.append(textRange);
        sb.append(", pages[");
        pages.forEach(page -> {
            sb.append(page.getNumber());
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java
@ -60,8 +60,8 @@ public class Document extends AbstractSemanticNode {
     *
     * @return A list of main sections within the document
     * @deprecated This method is marked for removal.
-     *  Use {@link #streamChildrenOfType(NodeType)} instead,
-     *  or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
+     * Use {@link #streamChildrenOfType(NodeType)} instead,
+     * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
     */
    @Deprecated(forRemoval = true)
    public List<Section> getMainSections() {
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java
@ -14,7 +14,7 @@ import java.util.stream.Stream;

 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
@ -42,7 +42,9 @@ public interface SemanticNode {
     */
    default TextBlock getTextBlock() {

-        return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
+        return streamAllSubNodes().filter(SemanticNode::isLeaf)
+                .map(SemanticNode::getTextBlock)
+                .collect(new TextBlockCollector());
    }


@ -68,7 +70,10 @@ public interface SemanticNode {

    default Page getFirstPage() {

-        return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
+        return getTextBlock().getPages()
+                .stream()
+                .min(Comparator.comparingInt(Page::getNumber))
+                .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
    }


@ -77,18 +82,19 @@ public interface SemanticNode {
     *
     * @return Set of PageNodes this node appears on.
     */
-    default Set<Page> getPages(Boundary boundary) {
+    default Set<Page> getPages(TextRange textRange) {

-        if (!getBoundary().contains(boundary)) {
-            throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
+        if (!getBoundary().contains(textRange)) {
+            throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary()));
        }
-        return getTextBlock().getPages(boundary);
+        return getTextBlock().getPages(textRange);
    }


    default boolean isOnPage(int pageNumber) {

-        return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber);
+        return getPages().stream()
+                .anyMatch(page -> page.getNumber() == pageNumber);
    }


@ -203,7 +209,9 @@ public interface SemanticNode {
     */
    default boolean hasEntitiesOfType(String type) {

-        return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
+        return getEntities().stream()
+                .filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
+                .anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
    }


@ -215,7 +223,9 @@ public interface SemanticNode {
     */
    default List<RedactionEntity> getEntitiesOfType(String type) {

-        return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList();
+        return getEntities().stream()
+                .filter(redactionEntity -> redactionEntity.getType().equals(type))
+                .toList();
    }


@ -227,7 +237,9 @@ public interface SemanticNode {
     */
    default List<RedactionEntity> getEntitiesOfType(List<String> types) {

-        return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList();
+        return getEntities().stream()
+                .filter(redactionEntity -> redactionEntity.isAnyType(types))
+                .toList();
    }


@ -241,7 +253,8 @@ public interface SemanticNode {

        TextBlock textBlock = getTextBlock();
        if (!textBlock.getAtomicTextBlocks().isEmpty()) {
-            return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
+            return getTextBlock().getAtomicTextBlocks()
+                    .get(0).getNumberOnPage();
        } else {
            return -1;
        }
@ -279,7 +292,8 @@ public interface SemanticNode {
     */
    default boolean containsStrings(List<String> strings) {

-        return strings.stream().allMatch(this::containsString);
+        return strings.stream()
+                .allMatch(this::containsString);
    }


@ -303,7 +317,8 @@ public interface SemanticNode {
     */
    default boolean containsAnyString(List<String> strings) {

-        return strings.stream().anyMatch(this::containsString);
+        return strings.stream()
+                .anyMatch(this::containsString);
    }


@ -315,7 +330,8 @@ public interface SemanticNode {
     */
    default boolean containsAnyStringIgnoreCase(List<String> strings) {

-        return strings.stream().anyMatch(this::containsStringIgnoreCase);
+        return strings.stream()
+                .anyMatch(this::containsStringIgnoreCase);
    }


@ -328,13 +344,13 @@ public interface SemanticNode {
    default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {

        TextBlock textBlock = getTextBlock();
-        if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
-            if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
+        if (textBlock.getTextRange().intersects(redactionEntity.getTextRange())) {
+            if (textBlock.containsBoundary(redactionEntity.getTextRange())) {
                redactionEntity.setDeepestFullyContainingNode(this);
            }

            redactionEntity.addIntersectingNode(this);
-            streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
+            streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getTextRange()))
                    .forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
        }
    }
@ -386,7 +402,8 @@ public interface SemanticNode {
     */
    default Stream<SemanticNode> streamAllSubNodes() {

-        return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
+        return getDocumentTree().allSubEntriesInOrder(getTreeId())
+                .map(DocumentTree.Entry::getNode);
    }


@ -397,7 +414,9 @@ public interface SemanticNode {
     */
    default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {

-        return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
+        return getDocumentTree().allSubEntriesInOrder(getTreeId())
+                .filter(entry -> entry.getType().equals(nodeType))
+                .map(DocumentTree.Entry::getNode);
    }


@ -406,9 +425,9 @@ public interface SemanticNode {
     *
     * @return Boundary of this Node's TextBlock
     */
-    default Boundary getBoundary() {
+    default TextRange getBoundary() {

-        return getTextBlock().getBoundary();
+        return getTextBlock().getTextRange();
    }


@ -454,8 +473,16 @@ public interface SemanticNode {
    private Map<Page, Rectangle2D> getBBoxFromChildren() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
-        List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
-        Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
+
+        List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren() //
+                .filter(SemanticNode::isNotOcrImage)
+                .map(SemanticNode::getBBox)
+                .toList();
+
+        Set<Page> pages = childrenBBoxes.stream()
+                .flatMap(map -> map.keySet()
+                        .stream())
+                .collect(Collectors.toSet());
        for (Page page : pages) {
            Rectangle2D bBoxOnPage = childrenBBoxes.stream()
                    .filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
@ -467,13 +494,24 @@ public interface SemanticNode {
    }


+    /**
+     * Predicate used to filter the children whose bounding boxes are merged into this node's bounding box.
+     * Note: despite the "Ocr" in its name, only the node type is inspected, so every IMAGE node is rejected.
+     *
+     * @param node the child node to test
+     * @return false if the node is of type {@link NodeType#IMAGE}, true otherwise
+     */
+    private static boolean isNotOcrImage(SemanticNode node) {
+
+        return !node.getType().equals(NodeType.IMAGE);
+    }
+
+
    /**
     * @return The union of all BoundingBoxes of the TextBlock of this node
     */
    private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
-        Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
+        Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
+                .stream()
+                .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java
@ -10,10 +10,12 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;

 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
 import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -36,14 +38,14 @@ public class AtomicTextBlock implements TextBlock {
    Page page;

    //string coordinates
-    Boundary boundary;
+    TextRange textRange;
    String searchText;
    @Builder.Default
    List<Integer> lineBreaks = new ArrayList<>();
    @Builder.Default
-    List<Boundary> boldTextBoundaries = new ArrayList<>();
+    List<TextRange> boldTextBoundaries = new ArrayList<>();
    @Builder.Default
-    List<Boundary> italicTextBoundaries = new ArrayList<>();
+    List<TextRange> italicTextBoundaries = new ArrayList<>();
    String orientation;
    int textDirection;

@ -64,10 +66,44 @@ public class AtomicTextBlock implements TextBlock {
    }


+    /**
+     * Returns the text covered by the given range with the line breaks of this block rendered as "\n".
+     * A whitespace character directly in front of a line break is replaced by the newline; if the character
+     * in front of a line break is not whitespace, the break is pushed back by one character instead.
+     *
+     * @param stringTextRange the range to extract, in the same coordinates as {@link #getTextRange()}
+     * @return the extracted text, or an empty string if the range is empty or not contained in this block
+     */
+    @Override
+    public String subSequenceWithLineBreaks(TextRange stringTextRange) {
+
+        if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
+            return "";
+        }
+
+        // lineBreaks are stored relative to the start of this block, so they must be shifted by the block's own
+        // start (not by the start of the requested range) before comparing them with the requested range.
+        int blockStart = getTextRange().start();
+        Set<Integer> lbInBoundary = lineBreaks.stream()
+                .map(i -> i + blockStart)
+                .filter(stringTextRange::contains)
+                .collect(Collectors.toSet());
+        // a range reaching the end of the block is terminated like a line
+        if (stringTextRange.end() == getTextRange().end()) {
+            lbInBoundary.add(getTextRange().end());
+        }
+        StringBuilder sb = new StringBuilder();
+        for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
+            char character = this.charAt(i);
+            if (lbInBoundary.contains(i + 1)) {
+                // always plus one, due to the linebreaks being an exclusive end index
+                if (!Character.isWhitespace(character)) {
+                    // keep the character and retry the break after the next character
+                    lbInBoundary.remove(i + 1);
+                    lbInBoundary.add(i + 2);
+                    sb.append(character);
+                    continue;
+                }
+                sb.append("\n");
+            } else {
+                sb.append(character);
+            }
+        }
+        return sb.toString();
+    }
+
+
    public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
                                                                 List<Integer> lineBreaks,
-                                                                 List<Boundary> boldTextBoundaries,
-                                                                 List<Boundary> italicTextBoundaries,
+                                                                 List<TextRange> boldTextBoundaries,
+                                                                 List<TextRange> italicTextBoundaries,
                                                                 List<Rectangle2D> positions,
                                                                 List<Integer> stringIdxToPositionIdx,
                                                                 long idx,
@ -89,7 +125,7 @@ public class AtomicTextBlock implements TextBlock {
                .italicTextBoundaries(italicTextBoundaries)
                .positions(positions)
                .stringIdxToPositionIdx(stringIdxToPositionIdx)
-                .boundary(new Boundary(offset, offset + searchText.length()))
+                .textRange(new TextRange(offset, offset + searchText.length()))
                .textDirection(textDirection)
                .orientation(orientation)
                .build();
@ -100,7 +136,7 @@ public class AtomicTextBlock implements TextBlock {

        return AtomicTextBlock.builder()
                .id(textBlockIdx)
-                .boundary(new Boundary(stringOffset, stringOffset))
+                .textRange(new TextRange(stringOffset, stringOffset))
                .searchText("")
                .page(page)
                .numberOnPage(numberOnPage)
@ -109,19 +145,18 @@ public class AtomicTextBlock implements TextBlock {
    }


-    public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
-                                                          DocumentPositionData documentPositionData,
-                                                          SemanticNode parent,
-                                                          Page page) {
+    public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {

        return AtomicTextBlock.builder()
                .id(documentTextData.getId())
                .numberOnPage(documentTextData.getNumberOnPage())
                .page(page)
-                .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
+                .textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
                .searchText(documentTextData.getSearchText())
-                .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
-                .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
+                .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
+                                    .toList())
+                .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
+                                                .toList())
                .positions(toRectangle2DList(documentPositionData.getPositions()))
                .parent(parent)
                .build();
@ -130,7 +165,9 @@ public class AtomicTextBlock implements TextBlock {

    private static List<Rectangle2D> toRectangle2DList(float[][] positions) {

-        return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
+        return Arrays.stream(positions)
+                .map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
+                .toList();
    }


@ -140,11 +177,11 @@ public class AtomicTextBlock implements TextBlock {
            throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
        }
        if (lineNumber == 0) {
-            return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
+            return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
        } else if (lineNumber == numberOfLines() - 1) {
-            return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
+            return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
        }
-        return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
+        return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
    }


@ -159,9 +196,9 @@ public class AtomicTextBlock implements TextBlock {
    public int getNextLinebreak(int fromIndex) {

        return lineBreaks.stream()//
-                .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
-                .findFirst() //
-                .orElse(searchText.length()) + boundary.start();
+                       .filter(linebreak -> linebreak > fromIndex - textRange.start()) //
+                       .findFirst() //
+                       .orElse(searchText.length()) + textRange.start();
    }


@ -169,43 +206,43 @@ public class AtomicTextBlock implements TextBlock {
    public int getPreviousLinebreak(int fromIndex) {

        return lineBreaks.stream()//
-                .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
-                .reduce((a, b) -> b)//
-                .orElse(0) + boundary.start();
+                       .filter(linebreak -> linebreak <= fromIndex - textRange.start())//
+                       .reduce((a, b) -> b)//
+                       .orElse(0) + textRange.start();
    }


    @Override
    public Rectangle2D getPosition(int stringIdx) {

-        return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
+        return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
    }


    @Override
-    public List<Rectangle2D> getPositions(Boundary stringBoundary) {
+    public List<Rectangle2D> getPositions(TextRange stringTextRange) {

-        if (!containsBoundary(stringBoundary)) {
-            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
+        if (!containsBoundary(stringTextRange)) {
+            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
        }
-        if (stringBoundary.length() == 0) {
+        if (stringTextRange.length() == 0) {
            return Collections.emptyList();
        }

-        int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
+        int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());

-        if (stringBoundary.end() == this.boundary.end()) {
+        if (stringTextRange.end() == this.textRange.end()) {
            return positions.subList(startPositionIdx, positions.size());
        }

-        return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
+        return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));

    }


-    public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
+    public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {

-        List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
+        List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
                .stream()
                .map(this::getPositions)
                .map(RectangleTransformations::rectangleBBoxWithGaps)
@ -217,9 +254,12 @@ public class AtomicTextBlock implements TextBlock {
    }


-    private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
+    private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {

-        return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
+        return getLineBreaks().stream()
+                .map(linebreak -> linebreak + this.textRange.start())
+                .filter(textRange::contains)
+                .toList();
    }


--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java
@ -11,7 +11,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Stream;

-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;

 import lombok.AccessLevel;
@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    String searchText;
-    Boundary boundary;
+    TextRange textRange;


    public static ConcatenatedTextBlock empty() {
@ -37,29 +37,30 @@ public class ConcatenatedTextBlock implements TextBlock {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
-            boundary = new Boundary(-1, -1);
+            textRange = new TextRange(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
-        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
+        textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());

-        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
+        atomicTextBlocks.subList(1, atomicTextBlocks.size())
+                .forEach(this::concat);
    }


    public ConcatenatedTextBlock concat(TextBlock textBlock) {

-        int start = textBlock.getBoundary().start();
-        int end = textBlock.getBoundary().end();
+        int start = textBlock.getTextRange().start();
+        int end = textBlock.getTextRange().end();
        if (this.atomicTextBlocks.isEmpty()) {
-            boundary.setStart(start);
-            boundary.setEnd(end);
-        } else if (boundary.end() != start) {
-            throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
+            textRange.setStart(start);
+            textRange.setEnd(end);
+        } else if (textRange.end() != start) {
+            throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
-        boundary.setEnd(end);
+        textRange.setEnd(end);
        this.searchText = null;
        return this;
    }
@ -67,13 +68,18 @@ public class ConcatenatedTextBlock implements TextBlock {

    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

-        return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
+        return atomicTextBlocks.stream()
+                .filter(textBlock -> textBlock.getTextRange().contains(stringIdx))
+                .findAny()
+                .orElseThrow(IndexOutOfBoundsException::new);
    }


-    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
+    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {

-        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
+        return atomicTextBlocks.stream()
+                .filter(tb -> tb.getTextRange().intersects(textRange))
+                .toList();
    }


@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock {
    @Override
    public int numberOfLines() {

-        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
+        return atomicTextBlocks.stream()
+                .map(AtomicTextBlock::getLineBreaks)
+                .mapToInt(List::size).sum();
    }


@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock {
    @Override
    public List<Integer> getLineBreaks() {

-        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
+        return getAtomicTextBlocks().stream()
+                .flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
+                        .stream())
+                .toList();
    }


@ -125,47 +136,48 @@ public class ConcatenatedTextBlock implements TextBlock {


    @Override
-    public List<Rectangle2D> getPositions(Boundary stringBoundary) {
+    public List<Rectangle2D> getPositions(TextRange stringTextRange) {

-        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
+        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);

        if (textBlocks.size() == 1) {
-            return textBlocks.get(0).getPositions(stringBoundary);
+            return textBlocks.get(0).getPositions(stringTextRange);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
-        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
+        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
-        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
+        positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));

        return positions;
    }


    @Override
-    public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
+    public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {

-        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
+        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);

        if (textBlocks.size() == 1) {
-            return textBlocks.get(0).getPositionsPerPage(stringBoundary);
+            return textBlocks.get(0).getPositionsPerPage(stringTextRange);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
-        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));
+        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
-            rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
+            rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
-                lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
+                                                                        lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
+                                                                                                                        stringTextRange.end())));

        return rectanglesPerLinePerPage;
    }
@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock {
    private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {

        Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
-        map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
+        map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
+                                                               rectangles,
+                                                               (l1, l2) -> Stream.concat(l1.stream(), l2.stream())
+                                                                       .toList()));
        return mergedMap;
    }


+    /**
+     * Returns the text of the given range as produced by the line-break aware
+     * rendering of the underlying atomic text blocks.
+     * <p>
+     * An empty range, or a range that is not contained in this block's text range,
+     * yields an empty string. When the range spans several atomic text blocks, the
+     * first and last block are cut to the requested range and every block in between
+     * contributes its complete text.
+     *
+     * @param stringTextRange the range to extract
+     * @return the extracted text, or an empty string for an empty or foreign range
+     */
+    @Override
+    public String subSequenceWithLineBreaks(TextRange stringTextRange) {
+
+        if (stringTextRange.length() == 0) {
+            return "";
+        }
+        if (!getTextRange().contains(stringTextRange)) {
+            return "";
+        }
+
+        List<AtomicTextBlock> blocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
+        int lastIdx = blocks.size() - 1;
+
+        if (lastIdx == 0) {
+            // the whole range lies inside a single atomic block
+            return blocks.get(0).subSequenceWithLineBreaks(stringTextRange);
+        }
+
+        AtomicTextBlock head = blocks.get(0);
+        StringBuilder result = new StringBuilder();
+
+        // head block: from the requested start up to the end of the block
+        result.append(head.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), head.getTextRange().end())));
+
+        // blocks strictly between head and tail are fully covered by the range
+        for (int i = 1; i < lastIdx; i++) {
+            result.append(blocks.get(i).searchTextWithLineBreaks());
+        }
+
+        // tail block: from the start of the block up to the requested end
+        AtomicTextBlock tail = blocks.get(lastIdx);
+        result.append(tail.subSequenceWithLineBreaks(new TextRange(tail.getTextRange().start(), stringTextRange.end())));
+
+        return result.toString();
+    }
+
+
    @Override
    public String toString() {

@ -187,16 +230,22 @@ public class ConcatenatedTextBlock implements TextBlock {


    @Override
-    public List<Boundary> getBoldTextBoundaries() {
+    public List<TextRange> getBoldTextBoundaries() {

-        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
+        return getAtomicTextBlocks().stream()
+                .map(AtomicTextBlock::getBoldTextBoundaries)
+                .flatMap(Collection::stream)
+                .toList();
    }


    @Override
-    public List<Boundary> getItalicTextBoundaries() {
+    public List<TextRange> getItalicTextBoundaries() {

-        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
+        return getAtomicTextBlocks().stream()
+                .map(AtomicTextBlock::getItalicTextBoundaries)
+                .flatMap(Collection::stream)
+                .toList();
    }


--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java
@ -10,7 +10,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;

-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;

 public interface TextBlock extends CharSequence {
@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence {
    List<AtomicTextBlock> getAtomicTextBlocks();


-    List<Boundary> getBoldTextBoundaries();
+    List<TextRange> getBoldTextBoundaries();


-    List<Boundary> getItalicTextBoundaries();
+    List<TextRange> getItalicTextBoundaries();


    String getOrientation();
@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence {
    int getTextDirection();


-    Boundary getBoundary();
+    TextRange getTextRange();


    int getNextLinebreak(int fromIndex);
@ -48,31 +48,41 @@ public interface TextBlock extends CharSequence {
    Rectangle2D getPosition(int stringIdx);


-    List<Rectangle2D> getPositions(Boundary stringBoundary);
+    List<Rectangle2D> getPositions(TextRange stringTextRange);


-    Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);
+    Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);


    int numberOfLines();


+    String subSequenceWithLineBreaks(TextRange stringTextRange);
+
+
+    default String searchTextWithLineBreaks() {
+
+        return subSequenceWithLineBreaks(getTextRange());
+    }
+
    default int indexOf(String searchTerm) {

-        return indexOf(searchTerm, getBoundary().start());
+        return indexOf(searchTerm, getTextRange().start());
    }


    default Set<Page> getPages() {

-        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
+        return getAtomicTextBlocks().stream()
+                .map(AtomicTextBlock::getPage)
+                .collect(Collectors.toUnmodifiableSet());
    }


-    default Set<Page> getPages(Boundary boundary) {
+    default Set<Page> getPages(TextRange textRange) {

        return getAtomicTextBlocks().stream()
-                .filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
+                .filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
                .map(AtomicTextBlock::getPage)
                .collect(Collectors.toUnmodifiableSet());
    }
@ -80,38 +90,38 @@ public interface TextBlock extends CharSequence {

    default int indexOf(String searchTerm, int startOffset) {

-        int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
+        int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
        if (start == -1) {
            return -1;
        }
-        return start + getBoundary().start();
+        return start + getTextRange().start();
    }


    default CharSequence getFirstLine() {

-        return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
+        return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
    }


-    default boolean containsBoundary(Boundary boundary) {
+    default boolean containsBoundary(TextRange textRange) {

-        if (boundary.end() < boundary.start()) {
-            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
+        if (textRange.end() < textRange.start()) {
+            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
        }
-        return getBoundary().contains(boundary);
+        return getTextRange().contains(textRange);
    }


    default boolean containsIndex(int stringIndex) {

-        return getBoundary().contains(stringIndex);
+        return getTextRange().contains(stringIndex);
    }


-    default CharSequence subSequence(Boundary boundary) {
+    default CharSequence subSequence(TextRange textRange) {

-        return subSequence(boundary.start(), boundary.end());
+        return subSequence(textRange.start(), textRange.end());
    }


@ -128,21 +138,21 @@ public interface TextBlock extends CharSequence {
    @Override
    default CharSequence subSequence(int start, int end) {

-        return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
+        return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
    }


    @Override
    default int length() {

-        return getBoundary().length();
+        return getTextRange().length();
    }


    @Override
    default char charAt(int index) {

-        return getSearchText().charAt(index - getBoundary().start());
+        return getSearchText().charAt(index - getTextRange().start());
    }

 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
 import java.util.Collections;
 import java.util.List;

-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;

 import lombok.AccessLevel;
 import lombok.Builder;
@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto {
    String searchText;
    List<Integer> lineBreaks;
    List<Integer> stringIdxToPositionIdx;
-    List<Boundary> boldTextBoundaries;
-    List<Boundary> italicTextBoundaries;
+    List<TextRange> boldTextBoundaries;
+    List<TextRange> italicTextBoundaries;
    List<Rectangle2D> positions;


--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java
@ -9,7 +9,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Objects;

-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory {
    }


-    private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
+    /**
+     * Merges a list of string indices into text ranges with an exclusive end.
+     * <p>
+     * A new range is started only when the next index is more than one position
+     * past the exclusive end of the running range, so a gap of a single missing
+     * index is bridged and stays inside the same range.
+     * NOTE(review): assumes the indices arrive in ascending order - confirm against the caller.
+     *
+     * @param integers the indices to merge
+     * @return the merged ranges, or an empty list if no indices were given
+     */
+    private static List<TextRange> mergeToBoundaries(List<Integer> integers) {

        if (integers.isEmpty()) {
            return Collections.emptyList();
        }
-        List<Boundary> boundaries = new LinkedList<>();
+        List<TextRange> boundaries = new LinkedList<>();
        int start = integers.get(0);
        int end = integers.get(0) + 1;
        for (int current : integers) {
            if (current > end + 1) {
-                boundaries.add(new Boundary(start, end));
+                boundaries.add(new TextRange(start, end));
                start = current;
            }
            end = current + 1;
        }
-        if (boundaries.isEmpty()) {
-            boundaries.add(new Boundary(start, end));
-        }
+        // The loop emits a range only once it sees the gap that follows it, so the last
+        // range is always still pending here. Guarding this add with isEmpty() dropped
+        // the trailing range whenever more than one range had been found.
+        boundaries.add(new TextRange(start, end));
        return boundaries;
    }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java
@ -116,8 +116,8 @@ public class DocumentDataMapper {
                .page(atomicTextBlock.getPage().getNumber().longValue())
                .searchText(atomicTextBlock.getSearchText())
                .numberOnPage(atomicTextBlock.getNumberOnPage())
-                .start(atomicTextBlock.getBoundary().start())
-                .end(atomicTextBlock.getBoundary().end())
+                .start(atomicTextBlock.getTextRange().start())
+                .end(atomicTextBlock.getTextRange().end())
                .lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
                .build();
    }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java
@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
@ -82,15 +82,15 @@ public class TaasDocumentDataMapper {
    }


-    private static Range toRange(Boundary boundary) {
+    private static Range toRange(TextRange textRange) {

-        return new Range(boundary.start(), boundary.end());
+        return new Range(textRange.start(), textRange.end());
    }


-    private static List<Range> toRange(List<Boundary> boundary) {
+    private static List<Range> toRange(List<TextRange> textRange) {

-        return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
+        return textRange.stream().map(TaasDocumentDataMapper::toRange).toList();
    }


--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TokenCounter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TokenCounter.java
@ -0,0 +1,30 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
+
+import org.commonmark.node.Node;
+import org.commonmark.renderer.markdown.MarkdownRenderer;
+
+import com.didalgo.gpt3.Encoding;
+import com.didalgo.gpt3.GPT3Tokenizer;
+import com.didalgo.gpt3.TokenCount;
+
+/**
+ * Static helpers that count tokens with one shared tokenizer configured for the
+ * {@code CL100K_BASE} encoding.
+ */
+public class TokenCounter {
+
+    private static final GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE);
+
+
+    /**
+     * Renders the node to markdown, using the renderer built by
+     * {@code MarkdownParsingPipeline.buildRenderer()}, and counts the tokens of the result.
+     *
+     * @param node the node to render and count
+     * @return the token count of the rendered markdown
+     */
+    public static int countTokens(Node node) {
+
+        MarkdownRenderer renderer = buildRenderer();
+        return countTokens(renderer.render(node));
+    }
+
+
+    /**
+     * Counts the tokens of the given text with the shared tokenizer. The method is
+     * synchronized, so only one thread at a time runs the tokenizer through it.
+     *
+     * @param text the text to count
+     * @return the token count of the text
+     */
+    public static synchronized int countTokens(String text) {
+
+        int tokenCount = TokenCount.fromString(text, tokenizer);
+        return tokenCount;
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java
+++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java
@ -10,6 +10,7 @@ import org.springframework.amqp.core.Message;
 import org.springframework.amqp.rabbit.annotation.RabbitHandler;
 import org.springframework.amqp.rabbit.annotation.RabbitListener;
 import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.boot.actuate.logging.LogFileWebEndpoint;
 import org.springframework.stereotype.Service;

 import com.fasterxml.jackson.databind.ObjectMapper;
@ -18,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
+import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;

 import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
@ -29,6 +31,7 @@ import lombok.extern.slf4j.Slf4j;
 public class MessageHandler {

    private final LayoutParsingPipeline layoutParsingPipeline;
+    private final MarkdownParsingPipeline markdownParsingPipeline;
    private final ObjectMapper objectMapper;
    private final RabbitTemplate rabbitTemplate;
    private final static String X_PIPELINE_PREFIX = "X-PIPE-";
@ -41,30 +44,30 @@ public class MessageHandler {

        LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);

-        if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) {
-            throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!");
-        }
        log.info("Layout parsing request received {}", layoutParsingRequest.identifier());
        if (message.getMessageProperties().isRedelivered()) {
            throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
-                    layoutParsingRequest.identifier()));
+                                                                      layoutParsingRequest.identifier()));
+        }
+        LayoutParsingFinishedEvent layoutParsingFinishedEvent;
+        if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.MARKDOWN)) {
+            layoutParsingFinishedEvent = markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
+        } else {
+            layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
        }
-        LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
        sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent, message);
    }


    public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent, Message message) {

-        Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info);
+        Arrays.stream(layoutParsingFinishedEvent.message().split("\n"))
+                .forEach(log::info);
        rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent, m -> {
-            var forwardHeaders = message.getMessageProperties()
-                    .getHeaders()
-                    .entrySet()
+            var forwardHeaders = message.getMessageProperties().getHeaders().entrySet()
                    .stream()
                    .filter(e -> e.getKey().toUpperCase(Locale.ROOT).startsWith(X_PIPELINE_PREFIX))
-                    .collect(Collectors.toMap(Map.Entry::getKey,
-                            Map.Entry::getValue));
+                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            m.getMessageProperties().getHeaders().putAll(forwardHeaders);
            return m;
        });
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
    @Test
    public void testLayoutParserEndToEnd() {

-        String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
+        String filePath = "files/new/crafted document.pdf";

        runForFile(filePath);
    }
@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
    @SneakyThrows
    public void testLayoutParserEndToEndWithFolder() {

-        String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
+        String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-single-digit-headlines";
        List<Path> pdfFiles = Files.walk(Path.of(folder))
                .filter(path -> path.getFileName().toString().endsWith(".pdf"))
                .sorted(Comparator.comparing(Path::getFileName))
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
            file = new File(filePath);
        }

-        LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
+        LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE_OLD, true);
        prepareStorage(layoutParsingRequest, file);

        LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
@ -79,9 +79,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
                .forEach(log::info);

        File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
+        File markdownTmpFile = new File("/tmp/layoutparserEND2END/" + fileName + ".md");
        assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();

        storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
+        storageService.downloadTo(TENANT_ID, layoutParsingRequest.markdownDocumentStorageId(), markdownTmpFile);
    }


--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/MarkdownParsingPipelineTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/MarkdownParsingPipelineTest.java
@ -0,0 +1,54 @@
+package com.knecon.fforesight.service.layoutparser.server;
+
+import java.io.FileInputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.iqser.red.commons.jackson.ObjectMapperFactory;
+import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
+import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
+import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;
+import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
+
+import lombok.AccessLevel;
+import lombok.SneakyThrows;
+import lombok.experimental.FieldDefaults;
+
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class MarkdownParsingPipelineTest {
+
+    static String TENANT = "tenant";
+    ObjectMapper mapper = ObjectMapperFactory.create();
+    FileSystemBackedStorageService storageService = new FileSystemBackedStorageService(mapper);
+    MarkdownParsingPipeline markdownParsingPipeline = new MarkdownParsingPipeline(storageService);
+
+
+    /**
+     * Walks a local folder recursively and runs the markdown parsing pipeline for
+     * every file whose name ends with {@code .md}, printing each path before it is parsed.
+     */
+    @Test
+    @SneakyThrows
+    public void parseMarkdownsFromFolder() {
+
+        Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/confluence_dump/");
+        // Files.walk holds open directory handles until the stream is closed,
+        // so it must be used inside try-with-resources.
+        try (var paths = Files.walk(folder)) {
+            paths.filter(path -> path.getFileName().toFile().toString().endsWith(".md"))
+                    .peek(System.out::println)
+                    .forEach(this::parseMarkdown);
+        }
+    }
+
+
+    @SneakyThrows
+    public void parseMarkdown(Path file) {
+
+        LayoutParsingRequest layoutParsingRequest = AbstractTest.buildDefaultLayoutParsingRequest(file.getFileName().toFile().toString(), LayoutParsingType.MARKDOWN, true);
+
+        try (var in = new FileInputStream(file.toFile())) {
+            storageService.storeObject(TENANT, layoutParsingRequest.originFileStorageId(), in);
+        }
+
+        markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java
@ -1,71 +0,0 @@
-package com.knecon.fforesight.service.layoutparser.server.graph;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import java.util.Collections;
-import java.util.List;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
-
-class BoundaryTest {
-
-    Boundary startBoundary;
-
-
-    @BeforeEach
-    void setUp() {
-
-        startBoundary = new Boundary(10, 100);
-    }
-
-
-    @Test
-    void testContains() {
-
-        assertTrue(startBoundary.contains(11));
-        assertTrue(startBoundary.contains(50));
-        assertFalse(startBoundary.contains(9));
-        assertFalse(startBoundary.contains(100));
-        assertFalse(startBoundary.contains(150));
-        assertFalse(startBoundary.contains(-123));
-        assertTrue(startBoundary.contains(new Boundary(11, 99)));
-        assertTrue(startBoundary.contains(new Boundary(10, 100)));
-        assertTrue(startBoundary.contains(new Boundary(11, 11)));
-        assertFalse(startBoundary.contains(9, 100));
-        assertTrue(startBoundary.contains(100, 100));
-        assertFalse(startBoundary.contains(100, 101));
-        assertFalse(startBoundary.contains(150, 151));
-    }
-
-
-    @Test
-    void testIntersects() {
-
-        assertTrue(startBoundary.intersects(new Boundary(1, 11)));
-        assertTrue(startBoundary.intersects(new Boundary(11, 12)));
-        assertTrue(startBoundary.intersects(new Boundary(11, 100)));
-        assertFalse(startBoundary.intersects(new Boundary(100, 101)));
-        assertTrue(startBoundary.intersects(new Boundary(99, 101)));
-    }
-
-
-    @Test
-    void testSplit() {
-
-        assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size());
-        assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90)));
-        assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40)));
-        assertEquals(1, startBoundary.split(Collections.emptyList()).size());
-        assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size());
-        assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0)));
-        assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100)));
-        assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100)));
-    }
-
-}
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java
@ -0,0 +1,71 @@
+package com.knecon.fforesight.service.layoutparser.server.graph;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
+
+class TextRangeTest {
+
+    TextRange startTextRange;
+
+
+    @BeforeEach
+    void setUp() {
+
+        startTextRange = new TextRange(10, 100);
+    }
+
+
+    @Test
+    void testContains() {
+
+        assertTrue(startTextRange.contains(11));
+        assertTrue(startTextRange.contains(50));
+        assertFalse(startTextRange.contains(9));
+        assertFalse(startTextRange.contains(100));
+        assertFalse(startTextRange.contains(150));
+        assertFalse(startTextRange.contains(-123));
+        assertTrue(startTextRange.contains(new TextRange(11, 99)));
+        assertTrue(startTextRange.contains(new TextRange(10, 100)));
+        assertTrue(startTextRange.contains(new TextRange(11, 11)));
+        assertFalse(startTextRange.contains(9, 100));
+        assertTrue(startTextRange.contains(100, 100));
+        assertFalse(startTextRange.contains(100, 101));
+        assertFalse(startTextRange.contains(150, 151));
+    }
+
+
+    @Test
+    void testIntersects() {
+
+        assertTrue(startTextRange.intersects(new TextRange(1, 11)));
+        assertTrue(startTextRange.intersects(new TextRange(11, 12)));
+        assertTrue(startTextRange.intersects(new TextRange(11, 100)));
+        assertFalse(startTextRange.intersects(new TextRange(100, 101)));
+        assertTrue(startTextRange.intersects(new TextRange(99, 101)));
+    }
+
+
+    @Test
+    void testSplit() {
+
+        assertEquals(4, startTextRange.split(List.of(12, 40, 90)).size());
+        assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(List.of(12, 40, 90)));
+        assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
+        assertEquals(1, startTextRange.split(Collections.emptyList()).size());
+        assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
+        assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
+        assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
+        assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java
@ -41,6 +41,7 @@ import lombok.SneakyThrows;
@Import(AbstractTest.TestConfiguration.class)
 public abstract class AbstractTest {

+    public static final String MARKDOWN_FILE_ID = "markdown";
    @Autowired
    protected LayoutParsingStorageService layoutParsingStorageService;

@ -105,7 +106,7 @@ public abstract class AbstractTest {
    }


-    protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
+    public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {

        var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
        return LayoutParsingRequest.builder()
@ -121,6 +122,7 @@ public abstract class AbstractTest {
                .pageFileStorageId(fileName + PAGES_FILE_ID)
                .simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
                .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
+                .markdownDocumentStorageId(fileName + MARKDOWN_FILE_ID)
                .build();
    }

--- a/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf
+++ b/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf
--- a/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf
+++ b/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf
--- a/publish-custom-image.sh
+++ b/publish-custom-image.sh
@ -1,5 +1,9 @@
 #!/bin/bash
+
+set -e
+
 dir=${PWD##*/}
+
 gradle assemble

 # Get the current Git branch
@ -11,5 +15,32 @@ commit_hash=$(git rev-parse --short=5 HEAD)
 # Combine branch and commit hash
 buildName="${USER}-${branch}-${commit_hash}"

-gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
-echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
+gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion="${buildName}"
+# Image reference (registry/repository:tag) expected for the build above.
+newImageName="nexus.knecon.com:5001/ff/layoutparser-service-server:${buildName}"
+
+echo "full image name:"
+echo "${newImageName}"
+echo ""
+# Optional first argument: namespace to deploy to. Without it, only build and publish.
+if [ -z "$1" ]; then
+    exit 0
+fi
+
+namespace="${1}"
+deployment_name="layoutparser-service"
+
+echo "deploying to ${namespace}"
+# Image(s) currently set on the deployment's pod template (all containers are listed).
+oldImageName=$(rancher kubectl -n "${namespace}" get deployment "${deployment_name}" -o=jsonpath='{.spec.template.spec.containers[*].image}')
+# Same image reference: restart the rollout; otherwise update the container named like the deployment.
+if [ "${newImageName}" = "${oldImageName}" ]; then
+    echo "Image tag of ${deployment_name} did not change, redeploying..."
+    rancher kubectl rollout restart deployment "${deployment_name}" -n "${namespace}"
+else
+    echo "upgrading the image tag of ${deployment_name}..."
+    rancher kubectl set image "deployment/${deployment_name}" "${deployment_name}=${newImageName}" -n "${namespace}"
+fi
+# Report success only after the rollout status command returned successfully (set -e aborts otherwise).
+rancher kubectl rollout status deployment "${deployment_name}" -n "${namespace}"
+echo "Deployed ${deployment_name}:${buildName} to ${namespace}"
Author	SHA1	Message	Date
Kilian Schuettler	7bb2293915	CLARI-002: fix some stuff with DocumentDataParser * still todo, exclude semanticNodes inside TableCells	2024-07-10 19:48:42 +02:00
Kilian Schuettler	3a57d26e97	Clari-002: markdown parser for documentData	2024-07-09 13:45:44 +02:00
Kilian Schuettler	e3819349cf	Clari-002: render document data as markdown	2024-07-09 11:02:28 +02:00
Kilian Schuettler	e68869495a	Clari-002: render document data as markdown	2024-07-08 13:38:40 +02:00
Kilian Schuettler	4fb0de82ec	CLARI-002: markdown chunking prototype	2024-06-24 17:51:05 +02:00