Clari-002: render document data as markdown

This commit is contained in:
Kilian Schuettler 2024-07-08 15:08:32 +02:00
parent 251c84f884
commit 9864d81d9d
2 changed files with 61 additions and 22 deletions

View File

@ -4,13 +4,16 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Stream;
import org.commonmark.ext.gfm.tables.TableBlock;
import org.commonmark.node.Block;
import org.commonmark.node.CustomBlock;
import org.commonmark.ext.gfm.tables.TableBody;
import org.commonmark.ext.gfm.tables.TableCell;
import org.commonmark.ext.gfm.tables.TableHead;
import org.commonmark.ext.gfm.tables.TableRow;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.Heading;
@ -25,55 +28,91 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@UtilityClass
public class DocumentDataParser {
Document document;
/**
 * Builds a commonmark {@link Document} from a stream of semantic nodes.
 * <p>
 * Every node is converted through {@code parseNode}; {@code null} conversion results are filtered out and the
 * remaining nodes are appended as children of a newly created document.
 *
 * @param semanticNodes the semantic nodes to convert
 * @return the document holding the converted nodes
 */
public Document parse(Stream<SemanticNode> semanticNodes) {
// NOTE(review): the next statement looks like the superseded pre-change line kept by the diff rendering.
// If it were live it would consume the stream before the pipeline below runs - confirm it is not part of the file.
semanticNodes.forEach(this::parseNode);
Document document = new Document();
semanticNodes.map(DocumentDataParser::parseNode)
.filter(Objects::nonNull)
.forEach(document::appendChild);
return document;
}
// NOTE(review): two signatures and two switch headers follow; the void signature and the statement-form switch
// look like superseded lines from the diff rendering - confirm which lines are live.
/**
 * Converts a single semantic node by dispatching on its type.
 * <p>
 * Headlines, paragraphs and tables are cast to their concrete class and passed to the matching parse method;
 * every other node type results in {@code null}.
 *
 * @param semanticNode the node to convert
 * @return the converted node, or {@code null} when the node type has no case in the switch
 */
private void parseNode(SemanticNode semanticNode) {
private Node parseNode(SemanticNode semanticNode) {
switch (semanticNode.getType()) {
return switch (semanticNode.getType()) {
case HEADLINE -> parseHeadline((Headline) semanticNode);
case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
case TABLE -> parseTable((Table) semanticNode);
// Any type not listed above is skipped; callers filter out the null.
default -> null;
};
}
/**
 * Converts a layout-parser {@link Table} into a commonmark GFM {@link TableBlock}.
 * <p>
 * Row 0 is always placed in the table head. Subsequent rows are added to the head as long as every cell of the
 * row reports itself as a header; the first row for which that is not the case, and all rows after it, are
 * placed in the table body.
 *
 * @param table the table to convert
 * @return a table block with exactly one head section followed by one body section
 */
private TableBlock parseTable(Table table) {

    // Head section: row 0 unconditionally, then the leading run of header-only rows.
    TableHead headSection = new TableHead();
    headSection.appendChild(createTableRow(table, 0));

    int rowIndex = 1;
    while (rowIndex < table.getNumberOfRows()) {
        boolean headerOnly = table.streamRow(rowIndex)
                .allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader);
        if (!headerOnly) {
            break;
        }
        headSection.appendChild(createTableRow(table, rowIndex));
        rowIndex++;
    }

    // Body section: everything from the first non-header row onwards.
    TableBody bodySection = new TableBody();
    while (rowIndex < table.getNumberOfRows()) {
        bodySection.appendChild(createTableRow(table, rowIndex));
        rowIndex++;
    }

    TableBlock markdownTable = new TableBlock();
    markdownTable.appendChild(headSection);
    markdownTable.appendChild(bodySection);
    return markdownTable;
}
// NOTE(review): the line below looks like the superseded signature of the old parseTable kept by the diff
// rendering - confirm it is not live code.
private void parseTable(Table table) {
/**
 * Builds one commonmark table row from the given row of the table.
 * <p>
 * Each cell streamed for the row is converted with {@code createTableCell} and appended to a new row node.
 *
 * @param table the table the row is taken from
 * @param row   the row index passed to {@code table.streamRow}
 * @return the row node containing the converted cells
 */
private TableRow createTableRow(Table table, int row) {
// NOTE(review): the next two statements appear to be superseded lines from the old parseTable body; they
// reference a 'document' that is not declared in this method - confirm they are not live code.
CustomBlock tableNode = new TableBlock();
document.appendChild(tableNode);
TableRow tableRow = new TableRow();
table.streamRow(row)
.map(DocumentDataParser::createTableCell)
.forEach(tableRow::appendChild);
return tableRow;
}
private void parseParagraph(Paragraph paragraph) {
/**
 * Builds a commonmark table cell from a layout-parser table cell.
 * <p>
 * A leaf cell contributes the nodes produced from its text block. A non-leaf cell contributes its children,
 * each converted through {@code parseNode}, with {@code null} conversion results left out.
 *
 * @param tc the layout-parser table cell to convert
 * @return the commonmark cell holding the converted content
 */
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {

    TableCell markdownCell = new TableCell();

    if (!tc.isLeaf()) {
        // Nested content: convert each child node and keep only those that produced a result.
        tc.streamChildren()
                .map(DocumentDataParser::parseNode)
                .filter(converted -> converted != null)
                .forEach(markdownCell::appendChild);
        return markdownCell;
    }

    // Leaf cell: its own text block is the content.
    parseTextBlock(tc.getTextBlock()).forEach(markdownCell::appendChild);
    return markdownCell;
}
/**
 * Converts a layout-parser paragraph into a commonmark paragraph.
 * <p>
 * The nodes produced by {@code parseTextBlock} for the paragraph's text block become the children of the new
 * commonmark paragraph.
 *
 * @param paragraph the paragraph to convert
 * @return the commonmark paragraph holding the converted text
 */
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
// NOTE(review): the local is named 'heading' although it holds a Paragraph - likely copied from parseHeadline.
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
// NOTE(review): the next statement looks like a superseded line from the diff rendering; 'document' is not
// declared in this method - confirm it is not live code.
document.appendChild(heading);
return heading;
}
// NOTE(review): the line below looks like the superseded void signature kept by the diff rendering - confirm
// it is not live code.
private void parseHeadline(Headline headline) {
/**
 * Converts a layout-parser headline into a commonmark {@link Heading}.
 * <p>
 * The nodes produced by {@code parseTextBlock} for the headline's text block become the children of the new
 * heading.
 *
 * @param headline the headline to convert
 * @return the heading holding the converted text
 */
private Heading parseHeadline(Headline headline) {
// NOTE(review): no level is set on the new Heading here - confirm the default level renders as intended.
Heading heading = new Heading();
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
// NOTE(review): the next statement looks like a superseded line from the diff rendering; 'document' is not
// declared in this method - confirm it is not live code.
document.appendChild(heading);
return heading;
}

View File

@ -105,7 +105,7 @@ public abstract class AbstractTest {
}
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
return LayoutParsingRequest.builder()