Merge branch 'document-data-markdown' into 'main'

CLARI: document-data-markdown

See merge request fforesight/layout-parser!181
This commit is contained in:
Kilian Schüttler 2024-07-18 17:19:44 +02:00
commit 370165dc59
29 changed files with 764 additions and 45 deletions

View File

@ -19,7 +19,6 @@ public record LayoutParsingRequest(
@Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,//
@Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")//
@ -37,9 +36,12 @@ public record LayoutParsingRequest(
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId) {
@NonNull String viewerDocumentStorageId
) {
}

View File

@ -26,4 +26,6 @@ dependencies {
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
}

View File

@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
@ -146,6 +147,9 @@ public class LayoutParsingPipeline {
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
if(layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);

View File

@ -1,9 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@ -89,7 +91,7 @@ public class LayoutParsingStorageService {
}
@SneakyThrows
@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
@ -165,4 +167,16 @@ public class LayoutParsingStorageService {
}
}
/**
 * Persists the rendered Markdown of a document.
 * <p>
 * The content is encoded as UTF-8 and handed to the storage service under the tenant
 * returned by {@code TenantContext.getTenantId()}.
 *
 * @param markdownFileStorageId storage id under which the Markdown file is stored
 * @param markdownContent       Markdown text to store
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {

    byte[] encodedMarkdown = markdownContent.getBytes(StandardCharsets.UTF_8);
    try (InputStream markdownStream = new ByteArrayInputStream(encodedMarkdown)) {
        storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, markdownStream);
    }
}
}

View File

@ -0,0 +1,331 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.commonmark.Extension;
import org.commonmark.ext.gfm.tables.TableBlock;
import org.commonmark.ext.gfm.tables.TableBody;
import org.commonmark.ext.gfm.tables.TableCell;
import org.commonmark.ext.gfm.tables.TableHead;
import org.commonmark.ext.gfm.tables.TableRow;
import org.commonmark.ext.gfm.tables.TablesExtension;
import org.commonmark.node.Block;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.Node;
import org.commonmark.node.Paragraph;
import org.commonmark.node.StrongEmphasis;
import org.commonmark.node.Text;
import org.commonmark.renderer.markdown.MarkdownRenderer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.AbstractNodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
public class MarkdownMapper extends AbstractNodeVisitor {
Document markdownDocument = new Document();
public String toMarkdownContent(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document document) {
visit(document);
return buildRenderer().render(this.markdownDocument);
}
@Override
public void visit(Headline headline) {
markdownDocument.appendChild(parseHeadline(headline));
}
@Override
public void visit(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) {
markdownDocument.appendChild(parseParagraph(paragraph));
}
@Override
public void visit(Table table) {
markdownDocument.appendChild(parseTable(table));
}
private static MarkdownRenderer buildRenderer() {
List<Extension> extensions = List.of(TablesExtension.create());
return MarkdownRenderer.builder().extensions(extensions).build();
}
private Block parseTable(Table table) {
// if (table.getNumberOfRows() == 1 && table.getNumberOfCols() == 1) {
// org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
// parseTextBlock(table.getTextBlock(), true).forEach(markdownParagraph::appendChild);
// return markdownParagraph;
// }
TableBlock tableNode = new TableBlock();
TableHead head = new TableHead();
TableRow tableRow = createTableRow(table, 0);
head.appendChild(tableRow);
int row = 1;
tableNode.appendChild(head);
TableBody tableBody = new TableBody();
for (; row < table.getNumberOfRows(); row++) {
tableBody.appendChild(createTableRow(table, row));
}
tableNode.appendChild(tableBody);
return tableNode;
}
private TableRow createTableRow(Table table, int row) {
TableRow tableRow = new TableRow();
table.streamRow(row)
.map(this::createTableCell)
.forEach(tableRow::appendChild);
return tableRow;
}
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
var cell = new TableCell();
List<SemanticNode> childNodes = tc.streamChildren()
.toList();
if (childNodes.isEmpty()) {
parseTextBlock(tc.getTextBlock(), false).forEach(cell::appendChild);
} else {
childNodes.forEach(semanticNode -> parseTextBlock(semanticNode.getTextBlock(), false).forEach(cell::appendChild));
}
return cell;
}
private Paragraph parseParagraph(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) {
org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
parseTextBlock(paragraph.getTextBlock(), true).forEach(markdownParagraph::appendChild);
return markdownParagraph;
}
private Heading parseHeadline(Headline headline) {
Heading heading = new Heading();
heading.setLevel(headline.getTreeId().size());
heading.appendChild(parseTextBlockAsText(headline.getTextBlock()));
return heading;
}
private Text parseTextBlockAsText(TextBlock textBlock) {
return new Text(textBlock.getSearchText());
}
private List<Node> parseTextBlock(TextBlock textBlock, boolean includeLineBreaks) {
LinkedList<Node> result = new LinkedList<>();
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
String fullText = getText(textBlock, textBlock.getTextRange(), includeLineBreaks);
List<Integer> lineTextSizes = getLineTextSizes(fullText);
int idx = 0;
int charCount = 0;
for (TextRangeWithTextType textRange : textRanges) {
String text = getText(textBlock, textRange.textRange(), includeLineBreaks);
String[] lines = text.split("\n");
for (String line : lines) {
charCount += line.length();
switch (textRange.fontStyle()) {
case REGULAR -> result.add(new Text(line));
case BOLD -> {
StrongEmphasis boldBlock = new StrongEmphasis();
boldBlock.appendChild(new Text(line));
result.add(boldBlock);
}
case ITALIC -> {
Emphasis italicBlock = new Emphasis();
italicBlock.appendChild(new Text(line));
result.add(italicBlock);
}
case BOLD_ITALIC -> {
Emphasis italicBlock = new Emphasis();
StrongEmphasis boldBlock = new StrongEmphasis();
boldBlock.appendChild(new Text(line));
italicBlock.appendChild(boldBlock);
result.add(italicBlock);
}
}
if (includeLineBreaks && lineTextSizes.get(idx).equals(charCount)) {
result.add(new HardLineBreak());
idx++;
}
}
}
if (!result.isEmpty() && result.getLast() instanceof HardLineBreak) {
result.removeLast();
}
return result;
}
private static List<Integer> getLineTextSizes(String input) {
String[] parts = input.split("\n");
List<Integer> textSizes = new ArrayList<>();
int size = 0;
for (int i = 0; i < parts.length; i++) {
size += parts[i].length();
textSizes.add(size);
}
if (textSizes.isEmpty()) {
textSizes.add(0);
}
return textSizes;
}
private static String getText(TextBlock textBlock, TextRange textRange, boolean includeLineBreaks) {
return includeLineBreaks ? textBlock.subSequenceWithLineBreaks(textRange) : textBlock.subSequence(textRange).toString();
}
private List<TextRangeWithTextType> mergeTextStyles(TextBlock textBlock) {
List<TextRangeWithTextType> result = new ArrayList<>();
TreeMap<Integer, Set<FontStyleChange>> styleChanges = new TreeMap<>();
int start = textBlock.getTextRange().start();
int end = textBlock.getTextRange().end();
for (TextRange bold : textBlock.getBoldTextBoundaries()) {
styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
}
for (TextRange italic : textBlock.getItalicTextBoundaries()) {
styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
}
if (styleChanges.isEmpty()) {
result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR));
return result;
}
Set<FontStyle> currentStyles = new HashSet<>();
currentStyles.add(FontStyle.REGULAR);
for (Map.Entry<Integer, Set<FontStyleChange>> entry : styleChanges.entrySet()) {
int point = entry.getKey();
Set<FontStyleChange> changes = entry.getValue();
if (point > start) {
FontStyle style = determineFontStyle(currentStyles);
result.add(new TextRangeWithTextType(new TextRange(start, point), style));
}
changes.stream()
.filter(FontStyleChange::leave)
.map(FontStyleChange::style)
.toList()
.forEach(currentStyles::remove);
currentStyles.addAll(changes.stream()
.filter(FontStyleChange::enter)
.map(FontStyleChange::style)
.toList());
if (currentStyles.isEmpty()) {
currentStyles.add(FontStyle.REGULAR);
}
start = point;
}
if (start < end) {
FontStyle style = determineFontStyle(currentStyles);
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style));
}
return result;
}
private FontStyle determineFontStyle(Set<FontStyle> styles) {
if (styles.contains(FontStyle.BOLD) && styles.contains(FontStyle.ITALIC)) {
return FontStyle.BOLD_ITALIC;
} else if (styles.contains(FontStyle.BOLD)) {
return FontStyle.BOLD;
} else if (styles.contains(FontStyle.ITALIC)) {
return FontStyle.ITALIC;
} else {
return FontStyle.REGULAR;
}
}
enum FontStyle {
REGULAR,
BOLD,
ITALIC,
BOLD_ITALIC;
}
record FontStyleChange(boolean enter, FontStyle style) {
public static FontStyleChange enter(FontStyle style) {
return new FontStyleChange(true, style);
}
public static FontStyleChange leave(FontStyle style) {
return new FontStyleChange(false, style);
}
public boolean leave() {
return !enter;
}
}
record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
}
}

View File

@ -0,0 +1,94 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
/**
 * Base {@link NodeVisitor} whose default behaviour for every node type is to descend into the
 * node's children. Subclasses override only the {@code visit} overloads they care about.
 */
public abstract class AbstractNodeVisitor implements NodeVisitor {

    @Override
    public void visit(Document document) {

        descendInto(document);
    }


    @Override
    public void visit(SuperSection superSection) {

        descendInto(superSection);
    }


    @Override
    public void visit(Section section) {

        descendInto(section);
    }


    @Override
    public void visit(Headline headline) {

        descendInto(headline);
    }


    @Override
    public void visit(Paragraph paragraph) {

        descendInto(paragraph);
    }


    @Override
    public void visit(Footer footer) {

        descendInto(footer);
    }


    @Override
    public void visit(Header header) {

        descendInto(header);
    }


    @Override
    public void visit(Image image) {

        descendInto(image);
    }


    @Override
    public void visit(Table table) {

        descendInto(table);
    }


    @Override
    public void visit(TableCell tableCell) {

        descendInto(tableCell);
    }


    /** Lets every child of the given node accept this visitor, in stream order. */
    private void descendInto(SemanticNode parent) {

        parent.streamChildren()
                .forEach(child -> child.accept(this));
    }

}

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -39,7 +40,10 @@ public class DocumentTree {
public TextBlock buildTextBlock() {
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
return allEntriesInOrder().map(Entry::getNode)
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
}
@ -113,13 +117,16 @@ public class DocumentTree {
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
return getEntryById(treeId).children.stream().map(Entry::getNode);
return getEntryById(treeId).children.stream()
.map(Entry::getNode);
}
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
return getEntryById(treeId).children.stream()
.filter(entry -> entry.node.getType().equals(nodeType))
.map(Entry::getNode);
}
@ -156,26 +163,32 @@ public class DocumentTree {
public Stream<Entry> allEntriesInOrder() {
return Stream.of(root).flatMap(DocumentTree::flatten);
return Stream.of(root)
.flatMap(DocumentTree::flatten);
}
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
return getEntryById(parentId).children.stream()
.flatMap(DocumentTree::flatten);
}
@Override
public String toString() {
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
return String.join("\n",
allEntriesInOrder().map(Entry::toString)
.toList());
}
private static Stream<Entry> flatten(Entry entry) {
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
return Stream.concat(Stream.of(entry),
entry.children.stream()
.flatMap(DocumentTree::flatten));
}

View File

@ -0,0 +1,45 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
/**
 * Visitor over the semantic nodes of the document graph.
 * <p>
 * There is one {@code visit} overload per node type; a node selects the matching overload by
 * passing itself to the visitor from its {@code accept} method.
 */
public interface NodeVisitor {
void visit(Document document);
void visit(SuperSection superSection);
void visit(Section section);
void visit(Headline headline);
void visit(Paragraph paragraph);
void visit(Footer footer);
void visit(Header header);
void visit(Image image);
void visit(Table table);
void visit(TableCell tableCell);
}

View File

@ -79,6 +79,12 @@ public class TextRange implements Comparable<TextRange> {
/**
 * Checks whether the index lies inside this range, treating both start and end as inclusive.
 *
 * @param index the index to test
 * @return {@code true} if {@code start <= index <= end}
 */
public boolean contains(int index) {

    return index >= start && index <= end;
}
/**
 * Checks whether the index lies inside this range, treating start as inclusive and end as
 * exclusive.
 *
 * @param index the index to test
 * @return {@code true} if {@code start <= index < end}
 */
public boolean containsExclusive(int index) {

    return index >= start && index < end;
}
@ -91,8 +97,13 @@ public class TextRange implements Comparable<TextRange> {
public List<TextRange> split(List<Integer> splitIndices) {
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
if (splitIndices.stream()
.anyMatch(idx -> !this.containsExclusive(idx))) {
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
splitIndices.stream()
.filter(idx -> !this.containsExclusive(idx))
.toList(),
this));
}
List<TextRange> splitBoundaries = new LinkedList<>();
int previousIndex = start;
@ -109,15 +120,23 @@ public class TextRange implements Comparable<TextRange> {
return splitBoundaries;
}
/**
 * Streams the indices covered by this range, from start (inclusive) up to but not including end.
 *
 * @return an ordered stream of the indices in this range
 */
public IntStream intStream() {
return IntStream.range(start, end);
}
public static TextRange merge(Collection<TextRange> boundaries) {
int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
int minStart = boundaries.stream()
.mapToInt(TextRange::start)
.min()
.orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream()
.mapToInt(TextRange::end)
.max()
.orElseThrow(IllegalArgumentException::new);
return new TextRange(minStart, maxEnd);
}

View File

@ -11,6 +11,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@ -60,8 +61,8 @@ public class Document extends AbstractSemanticNode {
*
* @return A list of main sections within the document
* @deprecated This method is marked for removal.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
*/
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {
@ -101,8 +102,7 @@ public class Document extends AbstractSemanticNode {
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElse(Headline.builder().build());
.findFirst().orElse(Headline.builder().build());
}
@ -163,4 +163,11 @@ public class Document extends AbstractSemanticNode {
return bBox;
}
/**
 * Double-dispatch entry point: passes this document to the visitor's {@code visit(Document)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -34,6 +35,12 @@ public class Footer extends AbstractSemanticNode {
}
/**
 * Double-dispatch entry point: passes this footer to the visitor's {@code visit(Footer)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -27,6 +28,13 @@ public class Header extends AbstractSemanticNode {
}
/**
 * Double-dispatch entry point: passes this header to the visitor's {@code visit(Header)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public NodeType getType() {

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -34,6 +35,13 @@ public class Headline extends AbstractSemanticNode {
}
/**
 * Double-dispatch entry point: passes this headline to the visitor's {@code visit(Headline)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {

View File

@ -7,6 +7,7 @@ import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -86,6 +87,13 @@ public class Image extends AbstractSemanticNode {
}
/**
 * Double-dispatch entry point: passes this image to the visitor's {@code visit(Image)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public boolean isLeaf() {

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -34,6 +35,13 @@ public class Paragraph extends AbstractSemanticNode {
}
/**
 * Double-dispatch entry point: passes this paragraph to the visitor's {@code visit(Paragraph)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public TextBlock getTextBlock() {

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -26,15 +27,20 @@ public class Section extends AbstractSemanticNode {
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElseGet(() -> getParent().getHeadline());
.findFirst().orElseGet(() -> getParent().getHeadline());
}
/**
 * Double-dispatch entry point: passes this section to the visitor's {@code visit(Section)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
.isPresent();
return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent();
}

View File

@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
public interface SemanticNode {
@ -73,8 +74,7 @@ public interface SemanticNode {
return getTextBlock().getPages()
.stream()
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
.min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
}
@ -254,8 +254,7 @@ public interface SemanticNode {
TextBlock textBlock = getTextBlock();
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
return getTextBlock().getAtomicTextBlocks()
.get(0).getNumberOnPage();
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
} else {
return -1;
}
@ -502,4 +501,7 @@ public interface SemanticNode {
return bBoxPerPage;
}
/**
 * Accepts a {@link NodeVisitor}. Implementations are expected to pass themselves to the
 * visitor's {@code visit} overload for their own node type.
 *
 * @param visitor the visitor to hand this node to
 */
void accept(NodeVisitor visitor);
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -26,8 +27,14 @@ public class SuperSection extends AbstractSemanticNode {
public Headline getHeadline() {
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElseGet(() -> getParent().getHeadline());
.findFirst().orElseGet(() -> getParent().getHeadline());
}
/**
 * Double-dispatch entry point: passes this super section to the visitor's {@code visit(SuperSection)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}

View File

@ -15,6 +15,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -353,4 +354,10 @@ public class Table implements SemanticNode {
return bBoxCache;
}
/**
 * Double-dispatch entry point: passes this table to the visitor's {@code visit(Table)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
}

View File

@ -5,6 +5,7 @@ import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -42,6 +43,13 @@ public class TableCell extends AbstractSemanticNode {
}
/**
 * Double-dispatch entry point: passes this table cell to the visitor's {@code visit(TableCell)} overload.
 *
 * @param visitor the visitor to hand this node to
 */
@Override
public void accept(NodeVisitor visitor) {
visitor.visit(this);
}
@Override
public NodeType getType() {

View File

@ -10,6 +10,8 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
@ -64,6 +66,40 @@ public class AtomicTextBlock implements TextBlock {
}
/**
 * Returns the text of the given range with line breaks rendered as {@code '\n'}.
 * <p>
 * The stored line break offsets are shifted by the start of this block's text range and
 * treated as exclusive end indices of a line. The whitespace character in front of such an
 * index is replaced by a newline. If the character in front of a line break is not
 * whitespace, it is kept and the break is deferred by one position; this repeats until a
 * whitespace character is found or the range ends. When the range ends at the end of this
 * block, the block end itself is treated as a line break.
 *
 * @param stringTextRange the range to read; must lie within this block's text range
 * @return the text with line breaks, or an empty string if the range is empty or not contained
 * in this block
 */
@Override
public String subSequenceWithLineBreaks(TextRange stringTextRange) {

    if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
        return "";
    }

    // Collected into an explicit HashSet because the set is mutated below;
    // Collectors.toSet() gives no guarantee that the returned set is modifiable.
    Set<Integer> lbInBoundary = lineBreaks.stream()
            .map(i -> i + this.textRange.start())
            .filter(stringTextRange::contains)
            .collect(Collectors.toCollection(java.util.HashSet::new));
    if (stringTextRange.end() == getTextRange().end()) {
        lbInBoundary.add(getTextRange().end());
    }

    StringBuilder sb = new StringBuilder();
    for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
        char character = this.charAt(i);
        if (lbInBoundary.contains(i + 1)) {
            // always plus one, due to the linebreaks being an exclusive end index
            if (!Character.isWhitespace(character)) {
                // Not a whitespace: keep the character and push the break to the next position.
                lbInBoundary.remove(i + 1);
                lbInBoundary.add(i + 2);
                sb.append(character);
                continue;
            }
            sb.append("\n");
        } else {
            sb.append(character);
        }
    }

    return sb.toString();
}
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
List<Integer> lineBreaks,
List<TextRange> boldTextBoundaries,

View File

@ -44,7 +44,8 @@ public class ConcatenatedTextBlock implements TextBlock {
this.atomicTextBlocks.add(firstTextBlock);
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
atomicTextBlocks.subList(1, atomicTextBlocks.size())
.forEach(this::concat);
}
@ -67,13 +68,18 @@ public class ConcatenatedTextBlock implements TextBlock {
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getTextRange().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
return atomicTextBlocks.stream()
.filter(textBlock -> textBlock.getTextRange().containsExclusive(stringIdx))
.findAny()
.orElseThrow(IndexOutOfBoundsException::new);
}
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
return atomicTextBlocks.stream().filter(tb -> tb.getTextRange().intersects(textRange)).toList();
return atomicTextBlocks.stream()
.filter(tb -> tb.getTextRange().intersects(textRange))
.toList();
}
@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock {
@Override
public int numberOfLines() {
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
return atomicTextBlocks.stream()
.map(AtomicTextBlock::getLineBreaks)
.mapToInt(List::size).sum();
}
@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock {
/**
 * Flattens the line-break lists of all atomic text blocks into a single list.
 * NOTE(review): the same pipeline appears twice (single-line and multi-line layout); this looks like a
 * before/after diff pair, so only one of the two returns should remain.
 */
@Override
public List<Integer> getLineBreaks() {
return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
return getAtomicTextBlocks().stream()
.flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
.stream())
.toList();
}
@ -165,7 +176,8 @@ public class ConcatenatedTextBlock implements TextBlock {
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
stringTextRange.end())));
return rectanglesPerLinePerPage;
}
@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock {
/**
 * Combines two page-to-rectangles maps into a new HashMap. Entries of map1 are copied first; each entry
 * of map2 is then merged in, and when a page is already present the two rectangle lists are concatenated
 * (existing list first) into a new list. Neither input map is modified.
 * NOTE(review): the map2.forEach merge appears twice (single-line and multi-line layout). This looks like
 * a before/after diff pair; as written both would execute and append map2's rectangles a second time,
 * so only one of them should remain.
 */
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
rectangles,
(l1, l2) -> Stream.concat(l1.stream(), l2.stream())
.toList()));
return mergedMap;
}
/**
 * Returns the text inside the given range, with line breaks, assembled from the atomic text blocks
 * that intersect the range.
 *
 * @param stringTextRange the range to extract
 * @return the extracted text, or an empty string when the range has length zero or is not contained
 *         in this block's text range
 */
@Override
public String subSequenceWithLineBreaks(TextRange stringTextRange) {

    if (stringTextRange.length() == 0) {
        return "";
    }
    if (!getTextRange().contains(stringTextRange)) {
        return "";
    }

    List<AtomicTextBlock> overlapping = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
    int lastIdx = overlapping.size() - 1;
    AtomicTextBlock head = overlapping.get(0);

    // A single intersecting block can answer the whole request on its own.
    if (lastIdx == 0) {
        return head.subSequenceWithLineBreaks(stringTextRange);
    }

    StringBuilder assembled = new StringBuilder();

    // First block: from the requested start up to the end of that block.
    TextRange headPart = new TextRange(stringTextRange.start(), head.getTextRange().end());
    assembled.append(head.subSequenceWithLineBreaks(headPart));

    // Blocks strictly between the first and the last are taken in full.
    for (int i = 1; i < lastIdx; i++) {
        assembled.append(overlapping.get(i).searchTextWithLineBreaks());
    }

    // Last block: from the start of that block up to the requested end.
    AtomicTextBlock tail = overlapping.get(lastIdx);
    TextRange tailPart = new TextRange(tail.getTextRange().start(), stringTextRange.end());
    assembled.append(tail.subSequenceWithLineBreaks(tailPart));

    return assembled.toString();
}
@Override
public String toString() {
@ -189,14 +232,20 @@ public class ConcatenatedTextBlock implements TextBlock {
/**
 * Flattens the bold text boundaries of all atomic text blocks into a single list.
 * NOTE(review): the same pipeline appears twice (single-line and multi-line layout); this looks like a
 * before/after diff pair, so only one of the two returns should remain.
 */
@Override
public List<TextRange> getBoldTextBoundaries() {
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
return getAtomicTextBlocks().stream()
.map(AtomicTextBlock::getBoldTextBoundaries)
.flatMap(Collection::stream)
.toList();
}
/**
 * Flattens the italic text boundaries of all atomic text blocks into a single list.
 * NOTE(review): the same pipeline appears twice (single-line and multi-line layout); this looks like a
 * before/after diff pair, so only one of the two returns should remain.
 */
@Override
public List<TextRange> getItalicTextBoundaries() {
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
return getAtomicTextBlocks().stream()
.map(AtomicTextBlock::getItalicTextBoundaries)
.flatMap(Collection::stream)
.toList();
}

View File

@ -57,6 +57,14 @@ public interface TextBlock extends CharSequence {
int numberOfLines();
/**
 * Returns the text covered by the given range as a String.
 * NOTE(review): presumably line breaks are rendered as "\n" inside the returned text; confirm against
 * the implementing classes.
 *
 * @param stringTextRange the range of text to extract
 * @return the extracted text
 */
String subSequenceWithLineBreaks(TextRange stringTextRange);
/**
 * Returns the result of subSequenceWithLineBreaks applied to this block's own text range.
 *
 * @return the text of the whole block as produced by subSequenceWithLineBreaks
 */
default String searchTextWithLineBreaks() {

    TextRange ownRange = getTextRange();
    return subSequenceWithLineBreaks(ownRange);
}
default int indexOf(String searchTerm) {
return indexOf(searchTerm, getTextRange().start());
@ -65,7 +73,9 @@ public interface TextBlock extends CharSequence {
/**
 * Collects the page of every atomic text block into an unmodifiable set.
 * NOTE(review): the same pipeline appears twice (single-line and multi-line layout); this looks like a
 * before/after diff pair, so only one of the two returns should remain.
 */
default Set<Page> getPages() {
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
return getAtomicTextBlocks().stream()
.map(AtomicTextBlock::getPage)
.collect(Collectors.toUnmodifiableSet());
}
@ -105,7 +115,7 @@ public interface TextBlock extends CharSequence {
/**
 * Reports whether the given string index lies within this block's text range.
 * NOTE(review): two return statements are present, one using TextRange.contains and one using
 * TextRange.containsExclusive. TextRangeTest asserts contains(100) as true and containsExclusive(100) as
 * false for the same range, so the two variants differ at the end index. This looks like a before/after
 * diff pair; confirm which one is meant to stay, since the second is unreachable as written.
 */
default boolean containsIndex(int stringIndex) {
return getTextRange().contains(stringIndex);
return getTextRange().containsExclusive(stringIndex);
}

View File

@ -26,6 +26,7 @@ public class SearchTextWithTextPositionFactory {
// This is why we need to initialize this to < -2; otherwise, if the very first symbol is a \n, we would detect a hyphen linebreak that isn't there.
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
// Factor by which the absolute Y distance between the current and the previous text position is multiplied
// before it is compared against their heights; being above 1, it lets slightly smaller gaps pass that comparison.
// NOTE(review): presumably this comparison is the line-break check — confirm against the method that uses it.
public static final double LINEBREAK_DELTA_TOLERANCE = 1.05;
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
@ -160,8 +161,8 @@ public class SearchTextWithTextPositionFactory {
return false;
}
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
return deltaY >= currentPosition.getHeightDir();
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE;
return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir();
}

View File

@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Disabled
public void testLayoutParserEndToEnd() {
// Passes a single hard-coded PDF path to runForFile.
// NOTE(review): filePath is declared twice below (an absolute /tmp path and a relative "files/..." path);
// this looks like a before/after diff pair, so only one declaration should remain.
String filePath = "/tmp/OCR_TEST/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf/document.pdf";
String filePath = "files/syngenta/CustomerFiles/Documine/Flora/425_F.1.1.1 - A13617AV - Acute Oral Toxicity Study.pdf";
runForFile(filePath);
}
@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
@ -82,6 +82,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_MARKDOWN.md");
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.documentMarkdownFileStorageId().get(), tmpFile);
}

View File

@ -31,7 +31,8 @@ class TextRangeTest {
assertTrue(startTextRange.contains(11));
assertTrue(startTextRange.contains(50));
assertFalse(startTextRange.contains(9));
assertFalse(startTextRange.contains(100));
assertTrue(startTextRange.contains(100));
assertFalse(startTextRange.contains(101));
assertFalse(startTextRange.contains(150));
assertFalse(startTextRange.contains(-123));
assertTrue(startTextRange.contains(new TextRange(11, 99)));
@ -44,6 +45,18 @@ class TextRangeTest {
}
@Test
void testContainsExclusive() {

    // Indices that containsExclusive must accept for startTextRange.
    for (int insideIdx : new int[]{11, 50}) {
        assertTrue(startTextRange.containsExclusive(insideIdx));
    }

    // Indices that containsExclusive must reject for startTextRange.
    for (int outsideIdx : new int[]{9, 100, 150, -123}) {
        assertFalse(startTextRange.containsExclusive(outsideIdx));
    }
}
@Test
void testIntersects() {

View File

@ -62,6 +62,7 @@ public abstract class AbstractTest {
protected final static String TEXT_FILE_ID = "texts";
protected final static String POSITION_FILE_ID = "positions";
protected final static String PAGES_FILE_ID = "pages";
protected final static String MARKDOWN_FILE_ID = "markdown";
protected final static String TENANT_ID = "tenant";
protected final static String VIEWER_DOCUMENT_ID = "viewer";
protected final static String SIMPLIFIED_ID = "simplified";
@ -105,7 +106,7 @@ public abstract class AbstractTest {
}
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
return LayoutParsingRequest.builder()
@ -121,6 +122,7 @@ public abstract class AbstractTest {
.pageFileStorageId(fileName + PAGES_FILE_ID)
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
.documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID))
.build();
}

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:04f3fc00d7e0851c6ee0663ce749562234cc95123ffdd643df88d621e4323ede
size 238546