Merge branch 'document-data-markdown' into 'main'
CLARI: document-data-markdown See merge request fforesight/layout-parser!181
This commit is contained in:
commit
370165dc59
@ -19,7 +19,6 @@ public record LayoutParsingRequest(
|
||||
@Schema(description = "Path to the original PDF file.")//
|
||||
@NonNull String originFileStorageId,//
|
||||
|
||||
|
||||
@Schema(description = "Optional Path to the table extraction file.")//
|
||||
Optional<String> tablesFileStorageId,//
|
||||
@Schema(description = "Optional Path to the image classification file.")//
|
||||
@ -37,9 +36,12 @@ public record LayoutParsingRequest(
|
||||
@NonNull String positionBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Pages File will be stored.")//
|
||||
@NonNull String pageFileStorageId,//
|
||||
@Schema(description = "Path where the Document Markdown File will be stored.")//
|
||||
Optional<String> documentMarkdownFileStorageId,//
|
||||
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
||||
@NonNull String simplifiedTextStorageId,//
|
||||
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
||||
@NonNull String viewerDocumentStorageId) {
|
||||
@NonNull String viewerDocumentStorageId
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -26,4 +26,6 @@ dependencies {
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||
implementation("org.commonmark:commonmark:0.22.0")
|
||||
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||
}
|
||||
|
||||
@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
@ -146,6 +147,9 @@ public class LayoutParsingPipeline {
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
if(layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
@ -89,7 +91,7 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@SneakyThrows
|
||||
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
@ -165,4 +167,16 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
|
||||
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {
|
||||
|
||||
try (InputStream inputStream = new ByteArrayInputStream(markdownContent.getBytes(StandardCharsets.UTF_8))) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, inputStream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,331 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.commonmark.Extension;
|
||||
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||
import org.commonmark.ext.gfm.tables.TableBody;
|
||||
import org.commonmark.ext.gfm.tables.TableCell;
|
||||
import org.commonmark.ext.gfm.tables.TableHead;
|
||||
import org.commonmark.ext.gfm.tables.TableRow;
|
||||
import org.commonmark.ext.gfm.tables.TablesExtension;
|
||||
import org.commonmark.node.Block;
|
||||
import org.commonmark.node.Document;
|
||||
import org.commonmark.node.Emphasis;
|
||||
import org.commonmark.node.HardLineBreak;
|
||||
import org.commonmark.node.Heading;
|
||||
import org.commonmark.node.Node;
|
||||
import org.commonmark.node.Paragraph;
|
||||
import org.commonmark.node.StrongEmphasis;
|
||||
import org.commonmark.node.Text;
|
||||
import org.commonmark.renderer.markdown.MarkdownRenderer;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.AbstractNodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
public class MarkdownMapper extends AbstractNodeVisitor {
|
||||
|
||||
Document markdownDocument = new Document();
|
||||
|
||||
|
||||
public String toMarkdownContent(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document document) {
|
||||
|
||||
visit(document);
|
||||
|
||||
return buildRenderer().render(this.markdownDocument);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Headline headline) {
|
||||
|
||||
markdownDocument.appendChild(parseHeadline(headline));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) {
|
||||
|
||||
markdownDocument.appendChild(parseParagraph(paragraph));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Table table) {
|
||||
|
||||
markdownDocument.appendChild(parseTable(table));
|
||||
}
|
||||
|
||||
|
||||
private static MarkdownRenderer buildRenderer() {
|
||||
|
||||
List<Extension> extensions = List.of(TablesExtension.create());
|
||||
return MarkdownRenderer.builder().extensions(extensions).build();
|
||||
}
|
||||
|
||||
|
||||
private Block parseTable(Table table) {
|
||||
|
||||
// if (table.getNumberOfRows() == 1 && table.getNumberOfCols() == 1) {
|
||||
// org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
|
||||
// parseTextBlock(table.getTextBlock(), true).forEach(markdownParagraph::appendChild);
|
||||
// return markdownParagraph;
|
||||
// }
|
||||
|
||||
TableBlock tableNode = new TableBlock();
|
||||
TableHead head = new TableHead();
|
||||
TableRow tableRow = createTableRow(table, 0);
|
||||
head.appendChild(tableRow);
|
||||
int row = 1;
|
||||
tableNode.appendChild(head);
|
||||
TableBody tableBody = new TableBody();
|
||||
for (; row < table.getNumberOfRows(); row++) {
|
||||
tableBody.appendChild(createTableRow(table, row));
|
||||
}
|
||||
tableNode.appendChild(tableBody);
|
||||
return tableNode;
|
||||
}
|
||||
|
||||
|
||||
private TableRow createTableRow(Table table, int row) {
|
||||
|
||||
TableRow tableRow = new TableRow();
|
||||
table.streamRow(row)
|
||||
.map(this::createTableCell)
|
||||
.forEach(tableRow::appendChild);
|
||||
return tableRow;
|
||||
}
|
||||
|
||||
|
||||
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
|
||||
|
||||
var cell = new TableCell();
|
||||
List<SemanticNode> childNodes = tc.streamChildren()
|
||||
.toList();
|
||||
if (childNodes.isEmpty()) {
|
||||
parseTextBlock(tc.getTextBlock(), false).forEach(cell::appendChild);
|
||||
} else {
|
||||
childNodes.forEach(semanticNode -> parseTextBlock(semanticNode.getTextBlock(), false).forEach(cell::appendChild));
|
||||
}
|
||||
return cell;
|
||||
}
|
||||
|
||||
|
||||
private Paragraph parseParagraph(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) {
|
||||
|
||||
org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph();
|
||||
parseTextBlock(paragraph.getTextBlock(), true).forEach(markdownParagraph::appendChild);
|
||||
return markdownParagraph;
|
||||
}
|
||||
|
||||
|
||||
private Heading parseHeadline(Headline headline) {
|
||||
|
||||
Heading heading = new Heading();
|
||||
heading.setLevel(headline.getTreeId().size());
|
||||
heading.appendChild(parseTextBlockAsText(headline.getTextBlock()));
|
||||
return heading;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Text parseTextBlockAsText(TextBlock textBlock) {
|
||||
|
||||
return new Text(textBlock.getSearchText());
|
||||
}
|
||||
|
||||
|
||||
private List<Node> parseTextBlock(TextBlock textBlock, boolean includeLineBreaks) {
|
||||
|
||||
LinkedList<Node> result = new LinkedList<>();
|
||||
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
|
||||
|
||||
String fullText = getText(textBlock, textBlock.getTextRange(), includeLineBreaks);
|
||||
List<Integer> lineTextSizes = getLineTextSizes(fullText);
|
||||
int idx = 0;
|
||||
int charCount = 0;
|
||||
for (TextRangeWithTextType textRange : textRanges) {
|
||||
String text = getText(textBlock, textRange.textRange(), includeLineBreaks);
|
||||
String[] lines = text.split("\n");
|
||||
for (String line : lines) {
|
||||
charCount += line.length();
|
||||
switch (textRange.fontStyle()) {
|
||||
case REGULAR -> result.add(new Text(line));
|
||||
case BOLD -> {
|
||||
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||
boldBlock.appendChild(new Text(line));
|
||||
result.add(boldBlock);
|
||||
}
|
||||
case ITALIC -> {
|
||||
Emphasis italicBlock = new Emphasis();
|
||||
italicBlock.appendChild(new Text(line));
|
||||
result.add(italicBlock);
|
||||
}
|
||||
case BOLD_ITALIC -> {
|
||||
Emphasis italicBlock = new Emphasis();
|
||||
|
||||
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||
boldBlock.appendChild(new Text(line));
|
||||
|
||||
italicBlock.appendChild(boldBlock);
|
||||
result.add(italicBlock);
|
||||
}
|
||||
}
|
||||
if (includeLineBreaks && lineTextSizes.get(idx).equals(charCount)) {
|
||||
result.add(new HardLineBreak());
|
||||
idx++;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
if (!result.isEmpty() && result.getLast() instanceof HardLineBreak) {
|
||||
result.removeLast();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getLineTextSizes(String input) {
|
||||
|
||||
String[] parts = input.split("\n");
|
||||
List<Integer> textSizes = new ArrayList<>();
|
||||
|
||||
int size = 0;
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
size += parts[i].length();
|
||||
textSizes.add(size);
|
||||
}
|
||||
|
||||
if (textSizes.isEmpty()) {
|
||||
textSizes.add(0);
|
||||
}
|
||||
|
||||
return textSizes;
|
||||
}
|
||||
|
||||
|
||||
private static String getText(TextBlock textBlock, TextRange textRange, boolean includeLineBreaks) {
|
||||
|
||||
return includeLineBreaks ? textBlock.subSequenceWithLineBreaks(textRange) : textBlock.subSequence(textRange).toString();
|
||||
}
|
||||
|
||||
|
||||
private List<TextRangeWithTextType> mergeTextStyles(TextBlock textBlock) {
|
||||
|
||||
List<TextRangeWithTextType> result = new ArrayList<>();
|
||||
|
||||
TreeMap<Integer, Set<FontStyleChange>> styleChanges = new TreeMap<>();
|
||||
|
||||
int start = textBlock.getTextRange().start();
|
||||
int end = textBlock.getTextRange().end();
|
||||
|
||||
for (TextRange bold : textBlock.getBoldTextBoundaries()) {
|
||||
styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
|
||||
styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
|
||||
}
|
||||
|
||||
for (TextRange italic : textBlock.getItalicTextBoundaries()) {
|
||||
styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
|
||||
styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
|
||||
}
|
||||
|
||||
if (styleChanges.isEmpty()) {
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR));
|
||||
return result;
|
||||
}
|
||||
|
||||
Set<FontStyle> currentStyles = new HashSet<>();
|
||||
currentStyles.add(FontStyle.REGULAR);
|
||||
|
||||
for (Map.Entry<Integer, Set<FontStyleChange>> entry : styleChanges.entrySet()) {
|
||||
int point = entry.getKey();
|
||||
Set<FontStyleChange> changes = entry.getValue();
|
||||
|
||||
if (point > start) {
|
||||
FontStyle style = determineFontStyle(currentStyles);
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, point), style));
|
||||
}
|
||||
|
||||
changes.stream()
|
||||
.filter(FontStyleChange::leave)
|
||||
.map(FontStyleChange::style)
|
||||
.toList()
|
||||
.forEach(currentStyles::remove);
|
||||
|
||||
currentStyles.addAll(changes.stream()
|
||||
.filter(FontStyleChange::enter)
|
||||
.map(FontStyleChange::style)
|
||||
.toList());
|
||||
|
||||
if (currentStyles.isEmpty()) {
|
||||
currentStyles.add(FontStyle.REGULAR);
|
||||
}
|
||||
|
||||
start = point;
|
||||
}
|
||||
|
||||
if (start < end) {
|
||||
FontStyle style = determineFontStyle(currentStyles);
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private FontStyle determineFontStyle(Set<FontStyle> styles) {
|
||||
|
||||
if (styles.contains(FontStyle.BOLD) && styles.contains(FontStyle.ITALIC)) {
|
||||
return FontStyle.BOLD_ITALIC;
|
||||
} else if (styles.contains(FontStyle.BOLD)) {
|
||||
return FontStyle.BOLD;
|
||||
} else if (styles.contains(FontStyle.ITALIC)) {
|
||||
return FontStyle.ITALIC;
|
||||
} else {
|
||||
return FontStyle.REGULAR;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
enum FontStyle {
|
||||
REGULAR,
|
||||
BOLD,
|
||||
ITALIC,
|
||||
BOLD_ITALIC;
|
||||
}
|
||||
|
||||
record FontStyleChange(boolean enter, FontStyle style) {
|
||||
|
||||
public static FontStyleChange enter(FontStyle style) {
|
||||
|
||||
return new FontStyleChange(true, style);
|
||||
}
|
||||
|
||||
|
||||
public static FontStyleChange leave(FontStyle style) {
|
||||
|
||||
return new FontStyleChange(false, style);
|
||||
}
|
||||
|
||||
|
||||
public boolean leave() {
|
||||
|
||||
return !enter;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
|
||||
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,94 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
|
||||
public abstract class AbstractNodeVisitor implements NodeVisitor {
|
||||
|
||||
@Override
|
||||
public void visit(Document document) {
|
||||
|
||||
visitChildren(document);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(SuperSection superSection) {
|
||||
|
||||
visitChildren(superSection);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Section section) {
|
||||
|
||||
visitChildren(section);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Headline headline) {
|
||||
|
||||
visitChildren(headline);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Paragraph paragraph) {
|
||||
|
||||
visitChildren(paragraph);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Footer footer) {
|
||||
|
||||
visitChildren(footer);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Header header) {
|
||||
|
||||
visitChildren(header);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Image image) {
|
||||
|
||||
visitChildren(image);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(Table table) {
|
||||
|
||||
visitChildren(table);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visit(TableCell tableCell) {
|
||||
|
||||
visitChildren(tableCell);
|
||||
}
|
||||
|
||||
|
||||
private void visitChildren(SemanticNode semanticNode) {
|
||||
|
||||
semanticNode.streamChildren()
|
||||
.forEach(node -> node.accept(this));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -39,7 +40,10 @@ public class DocumentTree {
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
return allEntriesInOrder().map(Entry::getNode)
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -113,13 +117,16 @@ public class DocumentTree {
|
||||
|
||||
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(treeId).children.stream().map(Entry::getNode);
|
||||
return getEntryById(treeId).children.stream()
|
||||
.map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
|
||||
|
||||
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
|
||||
return getEntryById(treeId).children.stream()
|
||||
.filter(entry -> entry.node.getType().equals(nodeType))
|
||||
.map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -156,26 +163,32 @@ public class DocumentTree {
|
||||
|
||||
public Stream<Entry> allEntriesInOrder() {
|
||||
|
||||
return Stream.of(root).flatMap(DocumentTree::flatten);
|
||||
return Stream.of(root)
|
||||
.flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
|
||||
|
||||
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
|
||||
return getEntryById(parentId).children.stream()
|
||||
.flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
|
||||
return String.join("\n",
|
||||
allEntriesInOrder().map(Entry::toString)
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
|
||||
return Stream.concat(Stream.of(entry),
|
||||
entry.children.stream()
|
||||
.flatMap(DocumentTree::flatten));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
|
||||
public interface NodeVisitor {
|
||||
|
||||
void visit(Document document);
|
||||
|
||||
|
||||
void visit(SuperSection superSection);
|
||||
|
||||
|
||||
void visit(Section section);
|
||||
|
||||
|
||||
void visit(Headline headline);
|
||||
|
||||
|
||||
void visit(Paragraph paragraph);
|
||||
|
||||
|
||||
void visit(Footer footer);
|
||||
|
||||
|
||||
void visit(Header header);
|
||||
|
||||
|
||||
void visit(Image image);
|
||||
|
||||
|
||||
void visit(Table table);
|
||||
|
||||
|
||||
void visit(TableCell tableCell);
|
||||
|
||||
}
|
||||
@ -79,6 +79,12 @@ public class TextRange implements Comparable<TextRange> {
|
||||
|
||||
public boolean contains(int index) {
|
||||
|
||||
return start <= index && index <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containsExclusive(int index) {
|
||||
|
||||
return start <= index && index < end;
|
||||
}
|
||||
|
||||
@ -91,8 +97,13 @@ public class TextRange implements Comparable<TextRange> {
|
||||
|
||||
public List<TextRange> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
if (splitIndices.stream()
|
||||
.anyMatch(idx -> !this.containsExclusive(idx))) {
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
|
||||
splitIndices.stream()
|
||||
.filter(idx -> !this.containsExclusive(idx))
|
||||
.toList(),
|
||||
this));
|
||||
}
|
||||
List<TextRange> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
@ -109,15 +120,23 @@ public class TextRange implements Comparable<TextRange> {
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
|
||||
public IntStream intStream() {
|
||||
|
||||
return IntStream.range(start, end);
|
||||
}
|
||||
|
||||
|
||||
public static TextRange merge(Collection<TextRange> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
int minStart = boundaries.stream()
|
||||
.mapToInt(TextRange::start)
|
||||
.min()
|
||||
.orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream()
|
||||
.mapToInt(TextRange::end)
|
||||
.max()
|
||||
.orElseThrow(IllegalArgumentException::new);
|
||||
return new TextRange(minStart, maxEnd);
|
||||
}
|
||||
|
||||
|
||||
@ -11,6 +11,7 @@ import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
@ -60,8 +61,8 @@ public class Document extends AbstractSemanticNode {
|
||||
*
|
||||
* @return A list of main sections within the document
|
||||
* @deprecated This method is marked for removal.
|
||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||
*/
|
||||
@Deprecated(forRemoval = true)
|
||||
public List<Section> getMainSections() {
|
||||
@ -101,8 +102,7 @@ public class Document extends AbstractSemanticNode {
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElse(Headline.builder().build());
|
||||
.findFirst().orElse(Headline.builder().build());
|
||||
}
|
||||
|
||||
|
||||
@ -163,4 +163,11 @@ public class Document extends AbstractSemanticNode {
|
||||
return bBox;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -34,6 +35,12 @@ public class Footer extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -27,6 +28,13 @@ public class Header extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -34,6 +35,13 @@ public class Headline extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -86,6 +87,13 @@ public class Image extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -34,6 +35,13 @@ public class Paragraph extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -26,15 +27,20 @@ public class Section extends AbstractSemanticNode {
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
.findFirst().orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||
.isPresent();
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
@ -73,8 +74,7 @@ public interface SemanticNode {
|
||||
|
||||
return getTextBlock().getPages()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
.min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
}
|
||||
|
||||
|
||||
@ -254,8 +254,7 @@ public interface SemanticNode {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
|
||||
return getTextBlock().getAtomicTextBlocks()
|
||||
.get(0).getNumberOnPage();
|
||||
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@ -502,4 +501,7 @@ public interface SemanticNode {
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Accepts a {@link NodeVisitor}. The implementations visible alongside this interface simply
 * call {@code visitor.visit(this)}, so that the visitor receives the concrete node type.
 *
 * @param visitor the visitor that should process this node
 */
void accept(NodeVisitor visitor);
|
||||
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -26,8 +27,14 @@ public class SuperSection extends AbstractSemanticNode {
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
.findFirst().orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -15,6 +15,7 @@ import java.util.stream.Stream;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
@ -353,4 +354,10 @@ public class Table implements SemanticNode {
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -42,6 +43,13 @@ public class TableCell extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void accept(NodeVisitor visitor) {
|
||||
|
||||
visitor.visit(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
|
||||
@ -10,6 +10,8 @@ import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
@ -64,6 +66,40 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
|
||||
|
||||
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
Set<Integer> lbInBoundary = lineBreaks.stream()
|
||||
.map(i -> i + this.textRange.start())
|
||||
.filter(stringTextRange::contains)
|
||||
.collect(Collectors.toSet());
|
||||
if (stringTextRange.end() == getTextRange().end()) {
|
||||
lbInBoundary.add(getTextRange().end());
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
|
||||
char character = this.charAt(i);
|
||||
if (lbInBoundary.contains(i + 1)) {
|
||||
// always plus one, due to the linebreaks being an exclusive end index
|
||||
if (!Character.isWhitespace(character)) {
|
||||
lbInBoundary.remove(i + 1);
|
||||
lbInBoundary.add(i + 2);
|
||||
sb.append(character);
|
||||
continue;
|
||||
}
|
||||
sb.append("\n");
|
||||
} else {
|
||||
sb.append(character);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||
List<Integer> lineBreaks,
|
||||
List<TextRange> boldTextBoundaries,
|
||||
|
||||
@ -44,7 +44,8 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
this.atomicTextBlocks.add(firstTextBlock);
|
||||
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
|
||||
|
||||
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
|
||||
atomicTextBlocks.subList(1, atomicTextBlocks.size())
|
||||
.forEach(this::concat);
|
||||
}
|
||||
|
||||
|
||||
@ -67,13 +68,18 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getTextRange().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
||||
return atomicTextBlocks.stream()
|
||||
.filter(textBlock -> textBlock.getTextRange().containsExclusive(stringIdx))
|
||||
.findAny()
|
||||
.orElseThrow(IndexOutOfBoundsException::new);
|
||||
}
|
||||
|
||||
|
||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(tb -> tb.getTextRange().intersects(textRange)).toList();
|
||||
return atomicTextBlocks.stream()
|
||||
.filter(tb -> tb.getTextRange().intersects(textRange))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
|
||||
return atomicTextBlocks.stream()
|
||||
.map(AtomicTextBlock::getLineBreaks)
|
||||
.mapToInt(List::size).sum();
|
||||
}
|
||||
|
||||
|
||||
@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
@Override
|
||||
public List<Integer> getLineBreaks() {
|
||||
|
||||
return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
|
||||
return getAtomicTextBlocks().stream()
|
||||
.flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
|
||||
.stream())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -165,7 +176,8 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
||||
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
|
||||
stringTextRange.end())));
|
||||
|
||||
return rectanglesPerLinePerPage;
|
||||
}
|
||||
@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
|
||||
|
||||
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
|
||||
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
|
||||
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
|
||||
rectangles,
|
||||
(l1, l2) -> Stream.concat(l1.stream(), l2.stream())
|
||||
.toList()));
|
||||
return mergedMap;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
|
||||
|
||||
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).subSequenceWithLineBreaks(stringTextRange);
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
sb.append(textBlock.searchTextWithLineBreaks());
|
||||
}
|
||||
|
||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
@ -189,14 +232,20 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
@Override
|
||||
public List<TextRange> getBoldTextBoundaries() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
|
||||
return getAtomicTextBlocks().stream()
|
||||
.map(AtomicTextBlock::getBoldTextBoundaries)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<TextRange> getItalicTextBoundaries() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
|
||||
return getAtomicTextBlocks().stream()
|
||||
.map(AtomicTextBlock::getItalicTextBoundaries)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -57,6 +57,14 @@ public interface TextBlock extends CharSequence {
|
||||
int numberOfLines();
|
||||
|
||||
|
||||
/**
 * Returns the text of the given range with line breaks included.
 * NOTE(review): the implementations visible next to this interface return an empty string
 * for an empty range or a range outside the block, and render line breaks as "\n" — confirm
 * this is the intended contract for all implementors.
 *
 * @param stringTextRange the range of text to extract
 * @return the extracted text including line breaks
 */
String subSequenceWithLineBreaks(TextRange stringTextRange);
|
||||
|
||||
|
||||
default String searchTextWithLineBreaks() {
|
||||
|
||||
return subSequenceWithLineBreaks(getTextRange());
|
||||
}
|
||||
|
||||
default int indexOf(String searchTerm) {
|
||||
|
||||
return indexOf(searchTerm, getTextRange().start());
|
||||
@ -65,7 +73,9 @@ public interface TextBlock extends CharSequence {
|
||||
|
||||
default Set<Page> getPages() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
|
||||
return getAtomicTextBlocks().stream()
|
||||
.map(AtomicTextBlock::getPage)
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
@ -105,7 +115,7 @@ public interface TextBlock extends CharSequence {
|
||||
|
||||
default boolean containsIndex(int stringIndex) {
|
||||
|
||||
return getTextRange().contains(stringIndex);
|
||||
return getTextRange().containsExclusive(stringIndex);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
// This is why we need to initialize this to < -2; otherwise, if the very first symbol is a \n, we would detect a hyphen linebreak that isn't there.
|
||||
// Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3.
|
||||
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||
public static final double LINEBREAK_DELTA_TOLERANCE = 1.05;
|
||||
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
||||
@ -160,8 +161,8 @@ public class SearchTextWithTextPositionFactory {
|
||||
return false;
|
||||
}
|
||||
|
||||
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||
return deltaY >= currentPosition.getHeightDir();
|
||||
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE;
|
||||
return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Disabled
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/tmp/OCR_TEST/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf/document.pdf";
|
||||
String filePath = "files/syngenta/CustomerFiles/Documine/Flora/425_F.1.1.1 - A13617AV - Acute Oral Toxicity Study.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
||||
String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
@ -82,6 +82,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||
|
||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
||||
|
||||
tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_MARKDOWN.md");
|
||||
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||
|
||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.documentMarkdownFileStorageId().get(), tmpFile);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -31,7 +31,8 @@ class TextRangeTest {
|
||||
assertTrue(startTextRange.contains(11));
|
||||
assertTrue(startTextRange.contains(50));
|
||||
assertFalse(startTextRange.contains(9));
|
||||
assertFalse(startTextRange.contains(100));
|
||||
assertTrue(startTextRange.contains(100));
|
||||
assertFalse(startTextRange.contains(101));
|
||||
assertFalse(startTextRange.contains(150));
|
||||
assertFalse(startTextRange.contains(-123));
|
||||
assertTrue(startTextRange.contains(new TextRange(11, 99)));
|
||||
@ -44,6 +45,18 @@ class TextRangeTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testContainsExclusive() {
|
||||
|
||||
assertTrue(startTextRange.containsExclusive(11));
|
||||
assertTrue(startTextRange.containsExclusive(50));
|
||||
assertFalse(startTextRange.containsExclusive(9));
|
||||
assertFalse(startTextRange.containsExclusive(100));
|
||||
assertFalse(startTextRange.containsExclusive(150));
|
||||
assertFalse(startTextRange.containsExclusive(-123));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testIntersects() {
|
||||
|
||||
|
||||
@ -62,6 +62,7 @@ public abstract class AbstractTest {
|
||||
protected final static String TEXT_FILE_ID = "texts";
|
||||
protected final static String POSITION_FILE_ID = "positions";
|
||||
protected final static String PAGES_FILE_ID = "pages";
|
||||
protected final static String MARKDOWN_FILE_ID = "markdown";
|
||||
protected final static String TENANT_ID = "tenant";
|
||||
protected final static String VIEWER_DOCUMENT_ID = "viewer";
|
||||
protected final static String SIMPLIFIED_ID = "simplified";
|
||||
@ -105,7 +106,7 @@ public abstract class AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
|
||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||
return LayoutParsingRequest.builder()
|
||||
@ -121,6 +122,7 @@ public abstract class AbstractTest {
|
||||
.pageFileStorageId(fileName + PAGES_FILE_ID)
|
||||
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||
.documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
Binary file not shown.
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:04f3fc00d7e0851c6ee0663ce749562234cc95123ffdd643df88d621e4323ede
|
||||
size 238546
|
||||
Loading…
x
Reference in New Issue
Block a user