From 9864d81d9ddb9e5d8a803ea083d421a8e9664952 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 8 Jul 2024 15:08:32 +0200 Subject: [PATCH] Clari-002: render document data as markdown --- .../markdown/DocumentDataParser.java | 81 ++++++++++++++----- .../server/utils/AbstractTest.java | 2 +- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java index 60645dd..750cdec 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java @@ -4,13 +4,16 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.TreeMap; import java.util.stream.Stream; import org.commonmark.ext.gfm.tables.TableBlock; -import org.commonmark.node.Block; -import org.commonmark.node.CustomBlock; +import org.commonmark.ext.gfm.tables.TableBody; +import org.commonmark.ext.gfm.tables.TableCell; +import org.commonmark.ext.gfm.tables.TableHead; +import org.commonmark.ext.gfm.tables.TableRow; import org.commonmark.node.Document; import org.commonmark.node.Emphasis; import org.commonmark.node.Heading; @@ -25,55 +28,91 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import lombok.AccessLevel; -import lombok.RequiredArgsConstructor; -import lombok.experimental.FieldDefaults; +import lombok.experimental.UtilityClass; -@RequiredArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +@UtilityClass public class DocumentDataParser { - Document document; - - public Document parse(Stream semanticNodes) { - semanticNodes.forEach(this::parseNode); + Document document = new Document(); + semanticNodes.map(DocumentDataParser::parseNode) + .filter(Objects::nonNull) + .forEach(document::appendChild); return document; } - private void parseNode(SemanticNode semanticNode) { + private Node parseNode(SemanticNode semanticNode) { - switch (semanticNode.getType()) { + return switch (semanticNode.getType()) { case HEADLINE -> parseHeadline((Headline) semanticNode); case PARAGRAPH -> parseParagraph((Paragraph) semanticNode); case TABLE -> parseTable((Table) semanticNode); + default -> null; + }; + } + + + private TableBlock parseTable(Table table) { + + TableBlock tableNode = new TableBlock(); + TableHead head = new TableHead(); + TableRow tableRow = createTableRow(table, 0); + head.appendChild(tableRow); + int row = 1; + for (; row < table.getNumberOfRows() && table.streamRow(row) + .allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader); row++) { + head.appendChild(createTableRow(table, row)); } + tableNode.appendChild(head); + TableBody tableBody = new TableBody(); + for (; row < table.getNumberOfRows(); row++) { + tableBody.appendChild(createTableRow(table, row)); + } + tableNode.appendChild(tableBody); + return tableNode; } - private void parseTable(Table table) { + private TableRow createTableRow(Table table, int row) { - CustomBlock tableNode = new TableBlock(); - - document.appendChild(tableNode); + TableRow tableRow = new TableRow(); + table.streamRow(row) + .map(DocumentDataParser::createTableCell) + .forEach(tableRow::appendChild); + return tableRow; } - private void parseParagraph(Paragraph paragraph) { + private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) { + + var cell = new TableCell(); + if (tc.isLeaf()) { + parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild); + } else { + tc.streamChildren() + .map(DocumentDataParser::parseNode) + .filter(Objects::nonNull) + .forEach(cell::appendChild); + } + return cell; + } + + + private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) { org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph(); parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild); - document.appendChild(heading); + return heading; } - private void parseHeadline(Headline headline) { + private Heading parseHeadline(Headline headline) { Heading heading = new Heading(); parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild); - document.appendChild(heading); + return heading; } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index 199f918..cbd7b6d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -105,7 +105,7 @@ public abstract class AbstractTest { } - protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) { + public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) { var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName); return LayoutParsingRequest.builder()