Clari-002: render document data as markdown
This commit is contained in:
parent
251c84f884
commit
9864d81d9d
@ -4,13 +4,16 @@ import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||
import org.commonmark.node.Block;
|
||||
import org.commonmark.node.CustomBlock;
|
||||
import org.commonmark.ext.gfm.tables.TableBody;
|
||||
import org.commonmark.ext.gfm.tables.TableCell;
|
||||
import org.commonmark.ext.gfm.tables.TableHead;
|
||||
import org.commonmark.ext.gfm.tables.TableRow;
|
||||
import org.commonmark.node.Document;
|
||||
import org.commonmark.node.Emphasis;
|
||||
import org.commonmark.node.Heading;
|
||||
@ -25,55 +28,91 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@UtilityClass
|
||||
public class DocumentDataParser {
|
||||
|
||||
Document document;
|
||||
|
||||
|
||||
public Document parse(Stream<SemanticNode> semanticNodes) {
|
||||
|
||||
semanticNodes.forEach(this::parseNode);
|
||||
Document document = new Document();
|
||||
semanticNodes.map(DocumentDataParser::parseNode)
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(document::appendChild);
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
private void parseNode(SemanticNode semanticNode) {
|
||||
private Node parseNode(SemanticNode semanticNode) {
|
||||
|
||||
switch (semanticNode.getType()) {
|
||||
return switch (semanticNode.getType()) {
|
||||
case HEADLINE -> parseHeadline((Headline) semanticNode);
|
||||
case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
|
||||
case TABLE -> parseTable((Table) semanticNode);
|
||||
default -> null;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private TableBlock parseTable(Table table) {
|
||||
|
||||
TableBlock tableNode = new TableBlock();
|
||||
TableHead head = new TableHead();
|
||||
TableRow tableRow = createTableRow(table, 0);
|
||||
head.appendChild(tableRow);
|
||||
int row = 1;
|
||||
for (; row < table.getNumberOfRows() && table.streamRow(row)
|
||||
.allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader); row++) {
|
||||
head.appendChild(createTableRow(table, row));
|
||||
}
|
||||
tableNode.appendChild(head);
|
||||
TableBody tableBody = new TableBody();
|
||||
for (; row < table.getNumberOfRows(); row++) {
|
||||
tableBody.appendChild(createTableRow(table, row));
|
||||
}
|
||||
tableNode.appendChild(tableBody);
|
||||
return tableNode;
|
||||
}
|
||||
|
||||
|
||||
private void parseTable(Table table) {
|
||||
private TableRow createTableRow(Table table, int row) {
|
||||
|
||||
CustomBlock tableNode = new TableBlock();
|
||||
|
||||
document.appendChild(tableNode);
|
||||
TableRow tableRow = new TableRow();
|
||||
table.streamRow(row)
|
||||
.map(DocumentDataParser::createTableCell)
|
||||
.forEach(tableRow::appendChild);
|
||||
return tableRow;
|
||||
}
|
||||
|
||||
|
||||
private void parseParagraph(Paragraph paragraph) {
|
||||
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
|
||||
|
||||
var cell = new TableCell();
|
||||
if (tc.isLeaf()) {
|
||||
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
|
||||
} else {
|
||||
tc.streamChildren()
|
||||
.map(DocumentDataParser::parseNode)
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(cell::appendChild);
|
||||
}
|
||||
return cell;
|
||||
}
|
||||
|
||||
|
||||
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
|
||||
|
||||
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
|
||||
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
|
||||
document.appendChild(heading);
|
||||
return heading;
|
||||
}
|
||||
|
||||
|
||||
private void parseHeadline(Headline headline) {
|
||||
private Heading parseHeadline(Headline headline) {
|
||||
|
||||
Heading heading = new Heading();
|
||||
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
|
||||
document.appendChild(heading);
|
||||
return heading;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -105,7 +105,7 @@ public abstract class AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
|
||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||
return LayoutParsingRequest.builder()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user