Clari-002: render document data as markdown
This commit is contained in:
parent
251c84f884
commit
9864d81d9d
@ -4,13 +4,16 @@ import java.util.ArrayList;
|
|||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.commonmark.ext.gfm.tables.TableBlock;
|
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||||
import org.commonmark.node.Block;
|
import org.commonmark.ext.gfm.tables.TableBody;
|
||||||
import org.commonmark.node.CustomBlock;
|
import org.commonmark.ext.gfm.tables.TableCell;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableHead;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableRow;
|
||||||
import org.commonmark.node.Document;
|
import org.commonmark.node.Document;
|
||||||
import org.commonmark.node.Emphasis;
|
import org.commonmark.node.Emphasis;
|
||||||
import org.commonmark.node.Heading;
|
import org.commonmark.node.Heading;
|
||||||
@ -25,55 +28,91 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Se
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.experimental.UtilityClass;
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@RequiredArgsConstructor
|
@UtilityClass
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public class DocumentDataParser {
|
public class DocumentDataParser {
|
||||||
|
|
||||||
Document document;
|
|
||||||
|
|
||||||
|
|
||||||
public Document parse(Stream<SemanticNode> semanticNodes) {
|
public Document parse(Stream<SemanticNode> semanticNodes) {
|
||||||
|
|
||||||
semanticNodes.forEach(this::parseNode);
|
Document document = new Document();
|
||||||
|
semanticNodes.map(DocumentDataParser::parseNode)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.forEach(document::appendChild);
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void parseNode(SemanticNode semanticNode) {
|
private Node parseNode(SemanticNode semanticNode) {
|
||||||
|
|
||||||
switch (semanticNode.getType()) {
|
return switch (semanticNode.getType()) {
|
||||||
case HEADLINE -> parseHeadline((Headline) semanticNode);
|
case HEADLINE -> parseHeadline((Headline) semanticNode);
|
||||||
case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
|
case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
|
||||||
case TABLE -> parseTable((Table) semanticNode);
|
case TABLE -> parseTable((Table) semanticNode);
|
||||||
|
default -> null;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TableBlock parseTable(Table table) {
|
||||||
|
|
||||||
|
TableBlock tableNode = new TableBlock();
|
||||||
|
TableHead head = new TableHead();
|
||||||
|
TableRow tableRow = createTableRow(table, 0);
|
||||||
|
head.appendChild(tableRow);
|
||||||
|
int row = 1;
|
||||||
|
for (; row < table.getNumberOfRows() && table.streamRow(row)
|
||||||
|
.allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader); row++) {
|
||||||
|
head.appendChild(createTableRow(table, row));
|
||||||
}
|
}
|
||||||
|
tableNode.appendChild(head);
|
||||||
|
TableBody tableBody = new TableBody();
|
||||||
|
for (; row < table.getNumberOfRows(); row++) {
|
||||||
|
tableBody.appendChild(createTableRow(table, row));
|
||||||
|
}
|
||||||
|
tableNode.appendChild(tableBody);
|
||||||
|
return tableNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void parseTable(Table table) {
|
private TableRow createTableRow(Table table, int row) {
|
||||||
|
|
||||||
CustomBlock tableNode = new TableBlock();
|
TableRow tableRow = new TableRow();
|
||||||
|
table.streamRow(row)
|
||||||
document.appendChild(tableNode);
|
.map(DocumentDataParser::createTableCell)
|
||||||
|
.forEach(tableRow::appendChild);
|
||||||
|
return tableRow;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void parseParagraph(Paragraph paragraph) {
|
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
|
||||||
|
|
||||||
|
var cell = new TableCell();
|
||||||
|
if (tc.isLeaf()) {
|
||||||
|
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
|
||||||
|
} else {
|
||||||
|
tc.streamChildren()
|
||||||
|
.map(DocumentDataParser::parseNode)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.forEach(cell::appendChild);
|
||||||
|
}
|
||||||
|
return cell;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
|
||||||
|
|
||||||
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
|
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
|
||||||
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
|
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
|
||||||
document.appendChild(heading);
|
return heading;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void parseHeadline(Headline headline) {
|
private Heading parseHeadline(Headline headline) {
|
||||||
|
|
||||||
Heading heading = new Heading();
|
Heading heading = new Heading();
|
||||||
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
|
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
|
||||||
document.appendChild(heading);
|
return heading;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -105,7 +105,7 @@ public abstract class AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||||
|
|
||||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||||
return LayoutParsingRequest.builder()
|
return LayoutParsingRequest.builder()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user