diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index f39572f..89f98b6 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -19,7 +19,6 @@ public record LayoutParsingRequest( @Schema(description = "Path to the original PDF file.")// @NonNull String originFileStorageId,// - @Schema(description = "Optional Path to the table extraction file.")// Optional tablesFileStorageId,// @Schema(description = "Optional Path to the image classification file.")// @@ -37,9 +36,12 @@ public record LayoutParsingRequest( @NonNull String positionBlockFileStorageId,// @Schema(description = "Path where the Document Pages File will be stored.")// @NonNull String pageFileStorageId,// + @Schema(description = "Path where the Document Markdown File will be stored.")// + Optional documentMarkdownFileStorageId,// @Schema(description = "Path where the Simplified Text File will be stored.")// @NonNull String simplifiedTextStorageId,// @Schema(description = "Path where the Viewer Document PDF will be stored.")// - @NonNull String viewerDocumentStorageId) { + @NonNull String viewerDocumentStorageId +) { } diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index ed30bd3..486a9c9 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -26,4 +26,6 @@ dependencies { implementation("org.springframework.boot:spring-boot-starter-web:3.1.3") implementation("org.jgrapht:jgrapht-core:1.5.2") implementation("org.tinspin:tinspin-indexes:2.1.3") + implementation("org.commonmark:commonmark:0.22.0") + implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0") } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 4aa9dd6..d1b504e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownMapper; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; @@ -146,6 +147,9 @@ public class LayoutParsingPipeline { log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); + if(layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { + layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph)); + } layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 18fb95d..9a4e1ae 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -1,9 +1,11 @@ package com.knecon.fforesight.service.layoutparser.processor; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -89,7 +91,7 @@ public class LayoutParsingStorageService { } - @SneakyThrows +@SneakyThrows public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) { try (InputStream inputStream = getObject(storageId)) { @@ -165,4 +167,16 @@ public class LayoutParsingStorageService { } } + + @SneakyThrows + @Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file") + public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) { + + try (InputStream inputStream = new ByteArrayInputStream(markdownContent.getBytes(StandardCharsets.UTF_8))) { + + storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, inputStream); + } + + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java new file mode 100644 index 0000000..f8239bc --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownMapper.java @@ -0,0 +1,331 @@ +package com.knecon.fforesight.service.layoutparser.processor.markdown; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import org.commonmark.Extension; +import org.commonmark.ext.gfm.tables.TableBlock; +import org.commonmark.ext.gfm.tables.TableBody; +import org.commonmark.ext.gfm.tables.TableCell; +import org.commonmark.ext.gfm.tables.TableHead; +import org.commonmark.ext.gfm.tables.TableRow; +import org.commonmark.ext.gfm.tables.TablesExtension; +import org.commonmark.node.Block; +import org.commonmark.node.Document; +import org.commonmark.node.Emphasis; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Heading; +import org.commonmark.node.Node; +import org.commonmark.node.Paragraph; +import org.commonmark.node.StrongEmphasis; +import org.commonmark.node.Text; +import org.commonmark.renderer.markdown.MarkdownRenderer; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.AbstractNodeVisitor; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; + +public class MarkdownMapper extends AbstractNodeVisitor { + + Document markdownDocument = new Document(); + + + public String toMarkdownContent(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document document) { + + visit(document); + + return buildRenderer().render(this.markdownDocument); + } + + + @Override + public void visit(Headline headline) { + + markdownDocument.appendChild(parseHeadline(headline)); + } + + + @Override + public void visit(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) { + + markdownDocument.appendChild(parseParagraph(paragraph)); + } + + + @Override + public void visit(Table table) { + + markdownDocument.appendChild(parseTable(table)); + } + + + private static MarkdownRenderer buildRenderer() { + + List extensions = List.of(TablesExtension.create()); + return MarkdownRenderer.builder().extensions(extensions).build(); + } + + + private Block parseTable(Table table) { + +// if (table.getNumberOfRows() == 1 && table.getNumberOfCols() == 1) { +// org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph(); +// parseTextBlock(table.getTextBlock(), true).forEach(markdownParagraph::appendChild); +// return markdownParagraph; +// } + + TableBlock tableNode = new TableBlock(); + TableHead head = new TableHead(); + TableRow tableRow = createTableRow(table, 0); + head.appendChild(tableRow); + int row = 1; + tableNode.appendChild(head); + TableBody tableBody = new TableBody(); + for (; row < table.getNumberOfRows(); row++) { + tableBody.appendChild(createTableRow(table, row)); + } + tableNode.appendChild(tableBody); + return tableNode; + } + + + private TableRow createTableRow(Table table, int row) { + + TableRow tableRow = new TableRow(); + table.streamRow(row) + .map(this::createTableCell) + .forEach(tableRow::appendChild); + return tableRow; + } + + + private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) { + + var cell = new TableCell(); + List childNodes = tc.streamChildren() + .toList(); + if (childNodes.isEmpty()) { + parseTextBlock(tc.getTextBlock(), false).forEach(cell::appendChild); + } else { + childNodes.forEach(semanticNode -> parseTextBlock(semanticNode.getTextBlock(), false).forEach(cell::appendChild)); + } + return cell; + } + + + private Paragraph parseParagraph(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph paragraph) { + + org.commonmark.node.Paragraph markdownParagraph = new org.commonmark.node.Paragraph(); + parseTextBlock(paragraph.getTextBlock(), true).forEach(markdownParagraph::appendChild); + return markdownParagraph; + } + + + private Heading parseHeadline(Headline headline) { + + Heading heading = new Heading(); + heading.setLevel(headline.getTreeId().size()); + heading.appendChild(parseTextBlockAsText(headline.getTextBlock())); + return heading; + + } + + + private Text parseTextBlockAsText(TextBlock textBlock) { + + return new Text(textBlock.getSearchText()); + } + + + private List parseTextBlock(TextBlock textBlock, boolean includeLineBreaks) { + + LinkedList result = new LinkedList<>(); + List textRanges = mergeTextStyles(textBlock); + + String fullText = getText(textBlock, textBlock.getTextRange(), includeLineBreaks); + List lineTextSizes = getLineTextSizes(fullText); + int idx = 0; + int charCount = 0; + for (TextRangeWithTextType textRange : textRanges) { + String text = getText(textBlock, textRange.textRange(), includeLineBreaks); + String[] lines = text.split("\n"); + for (String line : lines) { + charCount += line.length(); + switch (textRange.fontStyle()) { + case REGULAR -> result.add(new Text(line)); + case BOLD -> { + StrongEmphasis boldBlock = new StrongEmphasis(); + boldBlock.appendChild(new Text(line)); + result.add(boldBlock); + } + case ITALIC -> { + Emphasis italicBlock = new Emphasis(); + italicBlock.appendChild(new Text(line)); + result.add(italicBlock); + } + case BOLD_ITALIC -> { + Emphasis italicBlock = new Emphasis(); + + StrongEmphasis boldBlock = new StrongEmphasis(); + boldBlock.appendChild(new Text(line)); + + italicBlock.appendChild(boldBlock); + result.add(italicBlock); + } + } + if (includeLineBreaks && lineTextSizes.get(idx).equals(charCount)) { + result.add(new HardLineBreak()); + idx++; + } + + } + } + if (!result.isEmpty() && result.getLast() instanceof HardLineBreak) { + result.removeLast(); + } + return result; + } + + + private static List getLineTextSizes(String input) { + + String[] parts = input.split("\n"); + List textSizes = new ArrayList<>(); + + int size = 0; + for (int i = 0; i < parts.length; i++) { + size += parts[i].length(); + textSizes.add(size); + } + + if (textSizes.isEmpty()) { + textSizes.add(0); + } + + return textSizes; + } + + + private static String getText(TextBlock textBlock, TextRange textRange, boolean includeLineBreaks) { + + return includeLineBreaks ? textBlock.subSequenceWithLineBreaks(textRange) : textBlock.subSequence(textRange).toString(); + } + + + private List mergeTextStyles(TextBlock textBlock) { + + List result = new ArrayList<>(); + + TreeMap> styleChanges = new TreeMap<>(); + + int start = textBlock.getTextRange().start(); + int end = textBlock.getTextRange().end(); + + for (TextRange bold : textBlock.getBoldTextBoundaries()) { + styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD)); + styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD)); + } + + for (TextRange italic : textBlock.getItalicTextBoundaries()) { + styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC)); + styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC)); + } + + if (styleChanges.isEmpty()) { + result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR)); + return result; + } + + Set currentStyles = new HashSet<>(); + currentStyles.add(FontStyle.REGULAR); + + for (Map.Entry> entry : styleChanges.entrySet()) { + int point = entry.getKey(); + Set changes = entry.getValue(); + + if (point > start) { + FontStyle style = determineFontStyle(currentStyles); + result.add(new TextRangeWithTextType(new TextRange(start, point), style)); + } + + changes.stream() + .filter(FontStyleChange::leave) + .map(FontStyleChange::style) + .toList() + .forEach(currentStyles::remove); + + currentStyles.addAll(changes.stream() + .filter(FontStyleChange::enter) + .map(FontStyleChange::style) + .toList()); + + if (currentStyles.isEmpty()) { + currentStyles.add(FontStyle.REGULAR); + } + + start = point; + } + + if (start < end) { + FontStyle style = determineFontStyle(currentStyles); + result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style)); + } + + return result; + } + + + private FontStyle determineFontStyle(Set styles) { + + if (styles.contains(FontStyle.BOLD) && styles.contains(FontStyle.ITALIC)) { + return FontStyle.BOLD_ITALIC; + } else if (styles.contains(FontStyle.BOLD)) { + return FontStyle.BOLD; + } else if (styles.contains(FontStyle.ITALIC)) { + return FontStyle.ITALIC; + } else { + return FontStyle.REGULAR; + } + } + + + enum FontStyle { + REGULAR, + BOLD, + ITALIC, + BOLD_ITALIC; + } + + record FontStyleChange(boolean enter, FontStyle style) { + + public static FontStyleChange enter(FontStyle style) { + + return new FontStyleChange(true, style); + } + + + public static FontStyleChange leave(FontStyle style) { + + return new FontStyleChange(false, style); + } + + + public boolean leave() { + + return !enter; + } + + } + + record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) { + + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/AbstractNodeVisitor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/AbstractNodeVisitor.java new file mode 100644 index 0000000..c313c41 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/AbstractNodeVisitor.java @@ -0,0 +1,94 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; + +public abstract class AbstractNodeVisitor implements NodeVisitor { + + @Override + public void visit(Document document) { + + visitChildren(document); + } + + + @Override + public void visit(SuperSection superSection) { + + visitChildren(superSection); + } + + + @Override + public void visit(Section section) { + + visitChildren(section); + } + + + @Override + public void visit(Headline headline) { + + visitChildren(headline); + } + + + @Override + public void visit(Paragraph paragraph) { + + visitChildren(paragraph); + } + + + @Override + public void visit(Footer footer) { + + visitChildren(footer); + } + + + @Override + public void visit(Header header) { + + visitChildren(header); + } + + + @Override + public void visit(Image image) { + + visitChildren(image); + } + + + @Override + public void visit(Table table) { + + visitChildren(table); + } + + + @Override + public void visit(TableCell tableCell) { + + visitChildren(tableCell); + } + + + private void visitChildren(SemanticNode semanticNode) { + + semanticNode.streamChildren() + .forEach(node -> node.accept(this)); + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java index a95ee58..b5329bc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph; import static java.lang.String.format; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -39,7 +40,10 @@ public class DocumentTree { public TextBlock buildTextBlock() { - return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + return allEntriesInOrder().map(Entry::getNode) + .filter(SemanticNode::isLeaf) + .map(SemanticNode::getLeafTextBlock) + .collect(new TextBlockCollector()); } @@ -113,13 +117,16 @@ public class DocumentTree { public Stream childNodes(List treeId) { - return getEntryById(treeId).children.stream().map(Entry::getNode); + return getEntryById(treeId).children.stream() + .map(Entry::getNode); } public Stream childNodesOfType(List treeId, NodeType nodeType) { - return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode); + return getEntryById(treeId).children.stream() + .filter(entry -> entry.node.getType().equals(nodeType)) + .map(Entry::getNode); } @@ -156,26 +163,32 @@ public class DocumentTree { public Stream allEntriesInOrder() { - return Stream.of(root).flatMap(DocumentTree::flatten); + return Stream.of(root) + .flatMap(DocumentTree::flatten); } public Stream allSubEntriesInOrder(List parentId) { - return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten); + return getEntryById(parentId).children.stream() + .flatMap(DocumentTree::flatten); } @Override public String toString() { - return String.join("\n", allEntriesInOrder().map(Entry::toString).toList()); + return String.join("\n", + allEntriesInOrder().map(Entry::toString) + .toList()); } private static Stream flatten(Entry entry) { - return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten)); + return Stream.concat(Stream.of(entry), + entry.children.stream() + .flatMap(DocumentTree::flatten)); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/NodeVisitor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/NodeVisitor.java new file mode 100644 index 0000000..38a8bdd --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/NodeVisitor.java @@ -0,0 +1,45 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; + +public interface NodeVisitor { + + void visit(Document document); + + + void visit(SuperSection superSection); + + + void visit(Section section); + + + void visit(Headline headline); + + + void visit(Paragraph paragraph); + + + void visit(Footer footer); + + + void visit(Header header); + + + void visit(Image image); + + + void visit(Table table); + + + void visit(TableCell tableCell); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java index 62d41a9..2da6c55 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java @@ -79,6 +79,12 @@ public class TextRange implements Comparable { public boolean contains(int index) { + return start <= index && index <= end; + } + + + public boolean containsExclusive(int index) { + return start <= index && index < end; } @@ -91,8 +97,13 @@ public class TextRange implements Comparable { public List split(List splitIndices) { - if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) { - throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this)); + if (splitIndices.stream() + .anyMatch(idx -> !this.containsExclusive(idx))) { + throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", + splitIndices.stream() + .filter(idx -> !this.containsExclusive(idx)) + .toList(), + this)); } List splitBoundaries = new LinkedList<>(); int previousIndex = start; @@ -109,15 +120,23 @@ public class TextRange implements Comparable { return splitBoundaries; } + public IntStream intStream() { return IntStream.range(start, end); } + public static TextRange merge(Collection boundaries) { - int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new); - int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new); + int minStart = boundaries.stream() + .mapToInt(TextRange::start) + .min() + .orElseThrow(IllegalArgumentException::new); + int maxEnd = boundaries.stream() + .mapToInt(TextRange::end) + .max() + .orElseThrow(IllegalArgumentException::new); return new TextRange(minStart, maxEnd); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index db3976e..7139969 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -11,6 +11,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; @@ -60,8 +61,8 @@ public class Document extends AbstractSemanticNode { * * @return A list of main sections within the document * @deprecated This method is marked for removal. - * Use {@link #streamChildrenOfType(NodeType)} instead, - * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. + * Use {@link #streamChildrenOfType(NodeType)} instead, + * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. */ @Deprecated(forRemoval = true) public List
getMainSections() { @@ -101,8 +102,7 @@ public class Document extends AbstractSemanticNode { public Headline getHeadline() { return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node) - .findFirst() - .orElse(Headline.builder().build()); + .findFirst().orElse(Headline.builder().build()); } @@ -163,4 +163,11 @@ public class Document extends AbstractSemanticNode { return bBox; } + + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java index ed299d3..de4c73a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ -34,6 +35,12 @@ public class Footer extends AbstractSemanticNode { } + @Override + public void accept(NodeVisitor visitor) { + visitor.visit(this); + } + + @Override public TextBlock getTextBlock() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java index b648a8d..e01bdbc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ -27,6 +28,13 @@ public class Header extends AbstractSemanticNode { } + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); + } + + @Override public NodeType getType() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java index 00592f3..fe669df 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ -34,6 +35,13 @@ public class Headline extends AbstractSemanticNode { } + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); + } + + @Override public TextBlock getTextBlock() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java index 1588324..13afcf3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java @@ -7,6 +7,7 @@ import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ -86,6 +87,13 @@ public class Image extends AbstractSemanticNode { } + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); + } + + @Override public boolean isLeaf() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java index 70a5ce1..288ab54 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ -34,6 +35,13 @@ public class Paragraph extends AbstractSemanticNode { } + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); + } + + @Override public TextBlock getTextBlock() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index d86d682..b02c814 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -26,15 +27,20 @@ public class Section extends AbstractSemanticNode { public Headline getHeadline() { return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node) - .findFirst() - .orElseGet(() -> getParent().getHeadline()); + .findFirst().orElseGet(() -> getParent().getHeadline()); + } + + + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); } public boolean hasTables() { - return streamAllSubNodesOfType(NodeType.TABLE).findAny() - .isPresent(); + return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index b5c7410..9223cd8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.textbloc import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; public interface SemanticNode { @@ -73,8 +74,7 @@ public interface SemanticNode { return getTextBlock().getPages() .stream() - .min(Comparator.comparingInt(Page::getNumber)) - .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); + .min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); } @@ -254,8 +254,7 @@ public interface SemanticNode { TextBlock textBlock = getTextBlock(); if (!textBlock.getAtomicTextBlocks().isEmpty()) { - return getTextBlock().getAtomicTextBlocks() - .get(0).getNumberOnPage(); + return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage(); } else { return -1; } @@ -502,4 +501,7 @@ public interface SemanticNode { return bBoxPerPage; } + + void accept(NodeVisitor visitor); + } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java index 75bb270..662afe3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -26,8 +27,14 @@ public class SuperSection extends AbstractSemanticNode { public Headline getHeadline() { return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node) - .findFirst() - .orElseGet(() -> getParent().getHeadline()); + .findFirst().orElseGet(() -> getParent().getHeadline()); + } + + + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index 9d1e656..82d8539 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -15,6 +15,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; @@ -353,4 +354,10 @@ public class Table implements SemanticNode { return bBoxCache; } + + @Override + public void accept(NodeVisitor visitor) { + visitor.visit(this); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java index 3049460..f6ba77b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java @@ -5,6 +5,7 @@ import java.util.HashMap; import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.NodeVisitor; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; @@ -42,6 +43,13 @@ public class TableCell extends AbstractSemanticNode { } + @Override + public void accept(NodeVisitor visitor) { + + visitor.visit(this); + } + + @Override public NodeType getType() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java index 4749c82..20b328c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java @@ -10,6 +10,8 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; @@ -64,6 +66,40 @@ public class AtomicTextBlock implements TextBlock { } + @Override + public String subSequenceWithLineBreaks(TextRange stringTextRange) { + + if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) { + return ""; + } + + Set lbInBoundary = lineBreaks.stream() + .map(i -> i + this.textRange.start()) + .filter(stringTextRange::contains) + .collect(Collectors.toSet()); + if (stringTextRange.end() == getTextRange().end()) { + lbInBoundary.add(getTextRange().end()); + } + StringBuilder sb = new StringBuilder(); + for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) { + char character = this.charAt(i); + if (lbInBoundary.contains(i + 1)) { + // always plus one, due to the linebreaks being an exclusive end index + if (!Character.isWhitespace(character)) { + lbInBoundary.remove(i + 1); + lbInBoundary.add(i + 2); + sb.append(character); + continue; + } + sb.append("\n"); + } else { + sb.append(character); + } + } + return sb.toString(); + } + + public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText, List lineBreaks, List boldTextBoundaries, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java index c1ad087..fd7b865 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java @@ -44,7 +44,8 @@ public class ConcatenatedTextBlock implements TextBlock { this.atomicTextBlocks.add(firstTextBlock); textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end()); - atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat); + atomicTextBlocks.subList(1, atomicTextBlocks.size()) + .forEach(this::concat); } @@ -67,13 +68,18 @@ public class ConcatenatedTextBlock implements TextBlock { private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) { - return atomicTextBlocks.stream().filter(textBlock -> textBlock.getTextRange().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new); + return atomicTextBlocks.stream() + .filter(textBlock -> textBlock.getTextRange().containsExclusive(stringIdx)) + .findAny() + .orElseThrow(IndexOutOfBoundsException::new); } private List getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) { - return atomicTextBlocks.stream().filter(tb -> tb.getTextRange().intersects(textRange)).toList(); + return atomicTextBlocks.stream() + .filter(tb -> tb.getTextRange().intersects(textRange)) + .toList(); } @@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock { @Override public int numberOfLines() { - return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum(); + return atomicTextBlocks.stream() + .map(AtomicTextBlock::getLineBreaks) + .mapToInt(List::size).sum(); } @@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock { @Override public List getLineBreaks() { - return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList(); + return getAtomicTextBlocks().stream() + .flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks() + .stream()) + .toList(); } @@ -165,7 +176,8 @@ public class ConcatenatedTextBlock implements TextBlock { AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1); rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, - lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); + lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), + stringTextRange.end()))); return rectanglesPerLinePerPage; } @@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock { private Map> mergeEntityPositionsWithSamePageNode(Map> map1, Map> map2) { Map> mergedMap = new HashMap<>(map1); - map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList())); + map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, + rectangles, + (l1, l2) -> Stream.concat(l1.stream(), l2.stream()) + .toList())); return mergedMap; } + @Override + public String subSequenceWithLineBreaks(TextRange stringTextRange) { + + if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) { + return ""; + } + + List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); + + if (textBlocks.size() == 1) { + return textBlocks.get(0).subSequenceWithLineBreaks(stringTextRange); + } + + StringBuilder sb = new StringBuilder(); + AtomicTextBlock firstTextBlock = textBlocks.get(0); + sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()))); + + for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { + sb.append(textBlock.searchTextWithLineBreaks()); + } + + var lastTextBlock = textBlocks.get(textBlocks.size() - 1); + sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); + + return sb.toString(); + } + + @Override public String toString() { @@ -189,14 +232,20 @@ public class ConcatenatedTextBlock implements TextBlock { @Override public List getBoldTextBoundaries() { - return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList(); + return getAtomicTextBlocks().stream() + .map(AtomicTextBlock::getBoldTextBoundaries) + .flatMap(Collection::stream) + .toList(); } @Override public List getItalicTextBoundaries() { - return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList(); + return getAtomicTextBlocks().stream() + .map(AtomicTextBlock::getItalicTextBoundaries) + .flatMap(Collection::stream) + .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java index 01727ea..678e511 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java @@ -57,6 +57,14 @@ public interface TextBlock extends CharSequence { int numberOfLines(); + String subSequenceWithLineBreaks(TextRange stringTextRange); + + + default String searchTextWithLineBreaks() { + + return subSequenceWithLineBreaks(getTextRange()); + } + default int indexOf(String searchTerm) { return indexOf(searchTerm, getTextRange().start()); @@ -65,7 +73,9 @@ public interface TextBlock extends CharSequence { default Set getPages() { - return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet()); + return getAtomicTextBlocks().stream() + .map(AtomicTextBlock::getPage) + .collect(Collectors.toUnmodifiableSet()); } @@ -105,7 +115,7 @@ public interface TextBlock extends CharSequence { default boolean containsIndex(int stringIndex) { - return getTextRange().contains(stringIndex); + return getTextRange().containsExclusive(stringIndex); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 8acdf00..9fc0381 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -26,6 +26,7 @@ public class SearchTextWithTextPositionFactory { // This is why, we need to initialize this to < -2, otherwise, if the very first symbol is a \n we would detect a hyphen linebreak that isn't there. // Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3. public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3; + public static final double LINEBREAK_DELTA_TOLERANCE = 1.05; public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { @@ -160,8 +161,8 @@ public class SearchTextWithTextPositionFactory { return false; } - double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); - return deltaY >= currentPosition.getHeightDir(); + double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()) * LINEBREAK_DELTA_TOLERANCE; + return deltaY >= currentPosition.getHeightDir() || deltaY >= previousPosition.getHeightDir(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 331c775..30a3592 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Disabled public void testLayoutParserEndToEnd() { - String filePath = "/tmp/OCR_TEST/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf/document.pdf"; + String filePath = "files/syngenta/CustomerFiles/Documine/Flora/425_F.1.1.1 - A13617AV - Acute Oral Toxicity Study.pdf"; runForFile(filePath); } @@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files"; + String folder = "/Users/maverickstuder/Documents/Fforesight/layoutparser/layoutparser-service/layoutparser-service-server/src/test"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName)) @@ -82,6 +82,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest { assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs(); storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile); + + tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_MARKDOWN.md"); + assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs(); + + storageService.downloadTo(TENANT_ID, layoutParsingRequest.documentMarkdownFileStorageId().get(), tmpFile); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java index 385feb9..fb296a5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java @@ -31,7 +31,8 @@ class TextRangeTest { assertTrue(startTextRange.contains(11)); assertTrue(startTextRange.contains(50)); assertFalse(startTextRange.contains(9)); - assertFalse(startTextRange.contains(100)); + assertTrue(startTextRange.contains(100)); + assertFalse(startTextRange.contains(101)); assertFalse(startTextRange.contains(150)); assertFalse(startTextRange.contains(-123)); assertTrue(startTextRange.contains(new TextRange(11, 99))); @@ -44,6 +45,18 @@ class TextRangeTest { } + @Test + void testContainsExclusive() { + + assertTrue(startTextRange.containsExclusive(11)); + assertTrue(startTextRange.containsExclusive(50)); + assertFalse(startTextRange.containsExclusive(9)); + assertFalse(startTextRange.containsExclusive(100)); + assertFalse(startTextRange.containsExclusive(150)); + assertFalse(startTextRange.containsExclusive(-123)); + } + + @Test void testIntersects() { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index 199f918..961c015 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -62,6 +62,7 @@ public abstract class AbstractTest { protected final static String TEXT_FILE_ID = "texts"; protected final static String POSITION_FILE_ID = "positions"; protected final static String PAGES_FILE_ID = "pages"; + protected final static String MARKDOWN_FILE_ID = "markdown"; protected final static String TENANT_ID = "tenant"; protected final static String VIEWER_DOCUMENT_ID = "viewer"; protected final static String SIMPLIFIED_ID = "simplified"; @@ -105,7 +106,7 @@ public abstract class AbstractTest { } - protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) { + public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) { var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName); return LayoutParsingRequest.builder() @@ -121,6 +122,7 @@ public abstract class AbstractTest { .pageFileStorageId(fileName + PAGES_FILE_ID) .simplifiedTextStorageId(fileName + SIMPLIFIED_ID) .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID) + .documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID)) .build(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page5_30 - Dicamba - Acute Oral Toxicity - Rats.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page5_30 - Dicamba - Acute Oral Toxicity - Rats.pdf index f78a8f2..defe1a0 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page5_30 - Dicamba - Acute Oral Toxicity - Rats.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Page5_30 - Dicamba - Acute Oral Toxicity - Rats.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/econsulting.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/econsulting.pdf new file mode 100644 index 0000000..9759322 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/econsulting.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f3fc00d7e0851c6ee0663ce749562234cc95123ffdd643df88d621e4323ede +size 238546