Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9864d81d9d | ||
|
|
251c84f884 |
@ -0,0 +1,229 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBody;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableCell;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableHead;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableRow;
|
||||||
|
import org.commonmark.node.Document;
|
||||||
|
import org.commonmark.node.Emphasis;
|
||||||
|
import org.commonmark.node.Heading;
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
import org.commonmark.node.StrongEmphasis;
|
||||||
|
import org.commonmark.node.Text;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class DocumentDataParser {
|
||||||
|
|
||||||
|
public Document parse(Stream<SemanticNode> semanticNodes) {
|
||||||
|
|
||||||
|
Document document = new Document();
|
||||||
|
semanticNodes.map(DocumentDataParser::parseNode)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.forEach(document::appendChild);
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Node parseNode(SemanticNode semanticNode) {
|
||||||
|
|
||||||
|
return switch (semanticNode.getType()) {
|
||||||
|
case HEADLINE -> parseHeadline((Headline) semanticNode);
|
||||||
|
case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
|
||||||
|
case TABLE -> parseTable((Table) semanticNode);
|
||||||
|
default -> null;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TableBlock parseTable(Table table) {
|
||||||
|
|
||||||
|
TableBlock tableNode = new TableBlock();
|
||||||
|
TableHead head = new TableHead();
|
||||||
|
TableRow tableRow = createTableRow(table, 0);
|
||||||
|
head.appendChild(tableRow);
|
||||||
|
int row = 1;
|
||||||
|
for (; row < table.getNumberOfRows() && table.streamRow(row)
|
||||||
|
.allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader); row++) {
|
||||||
|
head.appendChild(createTableRow(table, row));
|
||||||
|
}
|
||||||
|
tableNode.appendChild(head);
|
||||||
|
TableBody tableBody = new TableBody();
|
||||||
|
for (; row < table.getNumberOfRows(); row++) {
|
||||||
|
tableBody.appendChild(createTableRow(table, row));
|
||||||
|
}
|
||||||
|
tableNode.appendChild(tableBody);
|
||||||
|
return tableNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TableRow createTableRow(Table table, int row) {
|
||||||
|
|
||||||
|
TableRow tableRow = new TableRow();
|
||||||
|
table.streamRow(row)
|
||||||
|
.map(DocumentDataParser::createTableCell)
|
||||||
|
.forEach(tableRow::appendChild);
|
||||||
|
return tableRow;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
|
||||||
|
|
||||||
|
var cell = new TableCell();
|
||||||
|
if (tc.isLeaf()) {
|
||||||
|
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
|
||||||
|
} else {
|
||||||
|
tc.streamChildren()
|
||||||
|
.map(DocumentDataParser::parseNode)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.forEach(cell::appendChild);
|
||||||
|
}
|
||||||
|
return cell;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
|
||||||
|
|
||||||
|
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
|
||||||
|
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
|
||||||
|
return heading;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Heading parseHeadline(Headline headline) {
|
||||||
|
|
||||||
|
Heading heading = new Heading();
|
||||||
|
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
|
||||||
|
return heading;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Node> parseTextBlock(TextBlock textBlock) {
|
||||||
|
|
||||||
|
List<Node> result = new ArrayList<>();
|
||||||
|
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
|
||||||
|
for (TextRangeWithTextType textRange : textRanges) {
|
||||||
|
switch (textRange.fontStyle()) {
|
||||||
|
case REGULAR -> result.add(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||||
|
case BOLD -> {
|
||||||
|
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||||
|
boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||||
|
result.add(boldBlock);
|
||||||
|
}
|
||||||
|
case ITALIC -> {
|
||||||
|
Emphasis italicBlock = new Emphasis();
|
||||||
|
italicBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||||
|
result.add(italicBlock);
|
||||||
|
}
|
||||||
|
case BOLD_ITALIC -> {
|
||||||
|
Emphasis italicBlock = new Emphasis();
|
||||||
|
|
||||||
|
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||||
|
boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||||
|
|
||||||
|
italicBlock.appendChild(boldBlock);
|
||||||
|
result.add(italicBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<TextRangeWithTextType> mergeTextStyles(TextBlock textBlock) {
|
||||||
|
|
||||||
|
List<TextRangeWithTextType> result = new ArrayList<>();
|
||||||
|
|
||||||
|
TreeMap<Integer, Set<FontStyle>> styleChanges = new TreeMap<>();
|
||||||
|
|
||||||
|
for (TextRange bold : textBlock.getBoldTextBoundaries()) {
|
||||||
|
styleChanges.computeIfAbsent(bold.start(), k -> new HashSet<>()).add(FontStyle.BOLD);
|
||||||
|
styleChanges.computeIfAbsent(bold.end(), k -> new HashSet<>()).add(FontStyle.REGULAR);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (TextRange italic : textBlock.getItalicTextBoundaries()) {
|
||||||
|
styleChanges.computeIfAbsent(italic.start(), k -> new HashSet<>()).add(FontStyle.ITALIC);
|
||||||
|
styleChanges.computeIfAbsent(italic.end(), k -> new HashSet<>()).add(FontStyle.REGULAR);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (styleChanges.isEmpty()) {
|
||||||
|
result.add(new TextRangeWithTextType(new TextRange(0, textBlock.length()), FontStyle.REGULAR));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
int start = 0;
|
||||||
|
Set<FontStyle> currentStyles = new HashSet<>();
|
||||||
|
currentStyles.add(FontStyle.REGULAR);
|
||||||
|
|
||||||
|
for (Map.Entry<Integer, Set<FontStyle>> entry : styleChanges.entrySet()) {
|
||||||
|
int point = entry.getKey();
|
||||||
|
Set<FontStyle> changes = entry.getValue();
|
||||||
|
|
||||||
|
if (point > start) {
|
||||||
|
FontStyle style = determineFontStyle(currentStyles);
|
||||||
|
result.add(new TextRangeWithTextType(new TextRange(start, point), style));
|
||||||
|
}
|
||||||
|
|
||||||
|
currentStyles.removeAll(changes);
|
||||||
|
currentStyles.addAll(changes);
|
||||||
|
if (currentStyles.isEmpty()) {
|
||||||
|
currentStyles.add(FontStyle.REGULAR);
|
||||||
|
}
|
||||||
|
|
||||||
|
start = point;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start < textBlock.length()) {
|
||||||
|
FontStyle style = determineFontStyle(currentStyles);
|
||||||
|
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.length()), style));
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private FontStyle determineFontStyle(Set<FontStyle> styles) {
|
||||||
|
|
||||||
|
if (styles.contains(FontStyle.BOLD) && styles.contains(FontStyle.ITALIC)) {
|
||||||
|
return FontStyle.BOLD_ITALIC;
|
||||||
|
} else if (styles.contains(FontStyle.BOLD)) {
|
||||||
|
return FontStyle.BOLD;
|
||||||
|
} else if (styles.contains(FontStyle.ITALIC)) {
|
||||||
|
return FontStyle.ITALIC;
|
||||||
|
} else {
|
||||||
|
return FontStyle.REGULAR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
enum FontStyle {
|
||||||
|
REGULAR,
|
||||||
|
BOLD,
|
||||||
|
ITALIC,
|
||||||
|
BOLD_ITALIC;
|
||||||
|
}
|
||||||
|
|
||||||
|
record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -13,13 +13,13 @@ import lombok.Setter;
|
|||||||
@Setter
|
@Setter
|
||||||
@EqualsAndHashCode
|
@EqualsAndHashCode
|
||||||
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
||||||
public class Boundary implements Comparable<Boundary> {
|
public class TextRange implements Comparable<TextRange> {
|
||||||
|
|
||||||
private int start;
|
private int start;
|
||||||
private int end;
|
private int end;
|
||||||
|
|
||||||
|
|
||||||
public Boundary(int start, int end) {
|
public TextRange(int start, int end) {
|
||||||
|
|
||||||
if (start > end) {
|
if (start > end) {
|
||||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||||
@ -47,15 +47,15 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(Boundary boundary) {
|
public boolean contains(TextRange textRange) {
|
||||||
|
|
||||||
return start <= boundary.start() && boundary.end() <= end;
|
return start <= textRange.start() && textRange.end() <= end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean containedBy(Boundary boundary) {
|
public boolean containedBy(TextRange textRange) {
|
||||||
|
|
||||||
return boundary.contains(this);
|
return textRange.contains(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -83,18 +83,18 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(Boundary boundary) {
|
public boolean intersects(TextRange textRange) {
|
||||||
|
|
||||||
return boundary.start() < this.end && this.start < boundary.end();
|
return textRange.start() < this.end && this.start < textRange.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Boundary> split(List<Integer> splitIndices) {
|
public List<TextRange> split(List<Integer> splitIndices) {
|
||||||
|
|
||||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||||
}
|
}
|
||||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
List<TextRange> splitBoundaries = new LinkedList<>();
|
||||||
int previousIndex = start;
|
int previousIndex = start;
|
||||||
for (int splitIndex : splitIndices) {
|
for (int splitIndex : splitIndices) {
|
||||||
|
|
||||||
@ -102,10 +102,10 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
if (splitIndex == previousIndex) {
|
if (splitIndex == previousIndex) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
|
||||||
previousIndex = splitIndex;
|
previousIndex = splitIndex;
|
||||||
}
|
}
|
||||||
splitBoundaries.add(new Boundary(previousIndex, end));
|
splitBoundaries.add(new TextRange(previousIndex, end));
|
||||||
return splitBoundaries;
|
return splitBoundaries;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,11 +114,11 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
return IntStream.range(start, end);
|
return IntStream.range(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
public static TextRange merge(Collection<TextRange> boundaries) {
|
||||||
|
|
||||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
|
||||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
|
||||||
return new Boundary(minStart, maxEnd);
|
return new TextRange(minStart, maxEnd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -130,12 +130,12 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(Boundary boundary) {
|
public int compareTo(TextRange textRange) {
|
||||||
|
|
||||||
if (end < boundary.end() && start < boundary.start()) {
|
if (end < textRange.end() && start < textRange.start()) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (start > boundary.start() && end > boundary.end()) {
|
if (start > textRange.start() && end > textRange.end()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -11,7 +11,7 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||||
@ -32,7 +32,7 @@ public class RedactionEntity {
|
|||||||
|
|
||||||
// initial values
|
// initial values
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
final Boundary boundary;
|
final TextRange textRange;
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
final String type;
|
final String type;
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
@ -66,9 +66,9 @@ public class RedactionEntity {
|
|||||||
SemanticNode deepestFullyContainingNode;
|
SemanticNode deepestFullyContainingNode;
|
||||||
|
|
||||||
|
|
||||||
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
public static RedactionEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) {
|
||||||
|
|
||||||
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
return RedactionEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -132,7 +132,7 @@ public class RedactionEntity {
|
|||||||
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||||
|
|
||||||
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
|
||||||
|
|
||||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||||
.stream()
|
.stream()
|
||||||
@ -157,19 +157,19 @@ public class RedactionEntity {
|
|||||||
|
|
||||||
public boolean containedBy(RedactionEntity redactionEntity) {
|
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
return this.boundary.containedBy(redactionEntity.getBoundary());
|
return this.textRange.containedBy(redactionEntity.getTextRange());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(RedactionEntity redactionEntity) {
|
public boolean contains(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
return this.boundary.contains(redactionEntity.getBoundary());
|
return this.textRange.contains(redactionEntity.getTextRange());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(RedactionEntity redactionEntity) {
|
public boolean intersects(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
return this.boundary.intersects(redactionEntity.getBoundary());
|
return this.textRange.intersects(redactionEntity.getTextRange());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -210,7 +210,7 @@ public class RedactionEntity {
|
|||||||
sb.append("Entity[\"");
|
sb.append("Entity[\"");
|
||||||
sb.append(value);
|
sb.append(value);
|
||||||
sb.append("\", ");
|
sb.append("\", ");
|
||||||
sb.append(boundary);
|
sb.append(textRange);
|
||||||
sb.append(", pages[");
|
sb.append(", pages[");
|
||||||
pages.forEach(page -> {
|
pages.forEach(page -> {
|
||||||
sb.append(page.getNumber());
|
sb.append(page.getNumber());
|
||||||
|
|||||||
@ -14,7 +14,7 @@ import java.util.stream.Stream;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
@ -77,12 +77,12 @@ public interface SemanticNode {
|
|||||||
*
|
*
|
||||||
* @return Set of PageNodes this node appears on.
|
* @return Set of PageNodes this node appears on.
|
||||||
*/
|
*/
|
||||||
default Set<Page> getPages(Boundary boundary) {
|
default Set<Page> getPages(TextRange textRange) {
|
||||||
|
|
||||||
if (!getBoundary().contains(boundary)) {
|
if (!getBoundary().contains(textRange)) {
|
||||||
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
|
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary()));
|
||||||
}
|
}
|
||||||
return getTextBlock().getPages(boundary);
|
return getTextBlock().getPages(textRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -328,13 +328,13 @@ public interface SemanticNode {
|
|||||||
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
|
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
TextBlock textBlock = getTextBlock();
|
TextBlock textBlock = getTextBlock();
|
||||||
if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
|
if (textBlock.getTextRange().intersects(redactionEntity.getTextRange())) {
|
||||||
if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
|
if (textBlock.containsBoundary(redactionEntity.getTextRange())) {
|
||||||
redactionEntity.setDeepestFullyContainingNode(this);
|
redactionEntity.setDeepestFullyContainingNode(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
redactionEntity.addIntersectingNode(this);
|
redactionEntity.addIntersectingNode(this);
|
||||||
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
|
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getTextRange()))
|
||||||
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
|
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -406,9 +406,9 @@ public interface SemanticNode {
|
|||||||
*
|
*
|
||||||
* @return Boundary of this Node's TextBlock
|
* @return Boundary of this Node's TextBlock
|
||||||
*/
|
*/
|
||||||
default Boundary getBoundary() {
|
default TextRange getBoundary() {
|
||||||
|
|
||||||
return getTextBlock().getBoundary();
|
return getTextBlock().getTextRange();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,10 +10,12 @@ import java.util.Collections;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
@ -36,14 +38,14 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
Page page;
|
Page page;
|
||||||
|
|
||||||
//string coordinates
|
//string coordinates
|
||||||
Boundary boundary;
|
TextRange textRange;
|
||||||
String searchText;
|
String searchText;
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<Integer> lineBreaks = new ArrayList<>();
|
List<Integer> lineBreaks = new ArrayList<>();
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<Boundary> boldTextBoundaries = new ArrayList<>();
|
List<TextRange> boldTextBoundaries = new ArrayList<>();
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<Boundary> italicTextBoundaries = new ArrayList<>();
|
List<TextRange> italicTextBoundaries = new ArrayList<>();
|
||||||
String orientation;
|
String orientation;
|
||||||
int textDirection;
|
int textDirection;
|
||||||
|
|
||||||
@ -64,10 +66,44 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the text of the given range with line breaks rendered as {@code "\n"}.
 *
 * <p>A whitespace character directly before a line break is replaced by the newline. If the
 * character before a line break is not whitespace, it is kept and the line break is moved
 * one index further. When the range ends at the end of this block, the block end is
 * treated as a line break as well.
 *
 * @param stringTextRange the range to extract, in the same coordinates as {@code getTextRange()}
 * @return the extracted text, or an empty string if the range is empty or not contained in this block
 */
@Override
public String subSequenceWithLineBreaks(TextRange stringTextRange) {

    if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
        return "";
    }

    // lineBreaks are stored relative to the start of this block (the other line-break
    // accessors of this class add textRange.start()), so they are shifted by the block
    // start, not by the start of the requested range.
    int blockStart = getTextRange().start();
    // Explicit HashSet: the set is mutated below and Collectors.toSet() does not guarantee mutability.
    Set<Integer> lbInBoundary = lineBreaks.stream()
            .map(i -> i + blockStart)
            .filter(stringTextRange::contains)
            .collect(Collectors.toCollection(java.util.HashSet::new));
    if (stringTextRange.end() == getTextRange().end()) {
        lbInBoundary.add(getTextRange().end());
    }

    StringBuilder sb = new StringBuilder();
    for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
        char character = this.charAt(i);
        // always plus one, due to the linebreaks being an exclusive end index
        if (!lbInBoundary.contains(i + 1)) {
            sb.append(character);
            continue;
        }
        if (Character.isWhitespace(character)) {
            sb.append("\n");
            continue;
        }
        // Keep the non-whitespace character and push the line break one index further.
        lbInBoundary.remove(i + 1);
        lbInBoundary.add(i + 2);
        sb.append(character);
    }
    return sb.toString();
}
|
||||||
|
|
||||||
|
|
||||||
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||||
List<Integer> lineBreaks,
|
List<Integer> lineBreaks,
|
||||||
List<Boundary> boldTextBoundaries,
|
List<TextRange> boldTextBoundaries,
|
||||||
List<Boundary> italicTextBoundaries,
|
List<TextRange> italicTextBoundaries,
|
||||||
List<Rectangle2D> positions,
|
List<Rectangle2D> positions,
|
||||||
List<Integer> stringIdxToPositionIdx,
|
List<Integer> stringIdxToPositionIdx,
|
||||||
long idx,
|
long idx,
|
||||||
@ -89,7 +125,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
.italicTextBoundaries(italicTextBoundaries)
|
.italicTextBoundaries(italicTextBoundaries)
|
||||||
.positions(positions)
|
.positions(positions)
|
||||||
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
||||||
.boundary(new Boundary(offset, offset + searchText.length()))
|
.textRange(new TextRange(offset, offset + searchText.length()))
|
||||||
.textDirection(textDirection)
|
.textDirection(textDirection)
|
||||||
.orientation(orientation)
|
.orientation(orientation)
|
||||||
.build();
|
.build();
|
||||||
@ -100,7 +136,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
|
|
||||||
return AtomicTextBlock.builder()
|
return AtomicTextBlock.builder()
|
||||||
.id(textBlockIdx)
|
.id(textBlockIdx)
|
||||||
.boundary(new Boundary(stringOffset, stringOffset))
|
.textRange(new TextRange(stringOffset, stringOffset))
|
||||||
.searchText("")
|
.searchText("")
|
||||||
.page(page)
|
.page(page)
|
||||||
.numberOnPage(numberOnPage)
|
.numberOnPage(numberOnPage)
|
||||||
@ -118,7 +154,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
.id(documentTextData.getId())
|
.id(documentTextData.getId())
|
||||||
.numberOnPage(documentTextData.getNumberOnPage())
|
.numberOnPage(documentTextData.getNumberOnPage())
|
||||||
.page(page)
|
.page(page)
|
||||||
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
|
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
|
||||||
.searchText(documentTextData.getSearchText())
|
.searchText(documentTextData.getSearchText())
|
||||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
||||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
||||||
@ -140,11 +176,11 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||||
}
|
}
|
||||||
if (lineNumber == 0) {
|
if (lineNumber == 0) {
|
||||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
|
||||||
} else if (lineNumber == numberOfLines() - 1) {
|
} else if (lineNumber == numberOfLines() - 1) {
|
||||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
|
||||||
}
|
}
|
||||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -159,9 +195,9 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
public int getNextLinebreak(int fromIndex) {
|
public int getNextLinebreak(int fromIndex) {
|
||||||
|
|
||||||
return lineBreaks.stream()//
|
return lineBreaks.stream()//
|
||||||
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
|
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
|
||||||
.findFirst() //
|
.findFirst() //
|
||||||
.orElse(searchText.length()) + boundary.start();
|
.orElse(searchText.length()) + textRange.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -169,43 +205,43 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
public int getPreviousLinebreak(int fromIndex) {
|
public int getPreviousLinebreak(int fromIndex) {
|
||||||
|
|
||||||
return lineBreaks.stream()//
|
return lineBreaks.stream()//
|
||||||
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
|
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
|
||||||
.reduce((a, b) -> b)//
|
.reduce((a, b) -> b)//
|
||||||
.orElse(0) + boundary.start();
|
.orElse(0) + textRange.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Rectangle2D getPosition(int stringIdx) {
|
public Rectangle2D getPosition(int stringIdx) {
|
||||||
|
|
||||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
|
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||||
|
|
||||||
if (!containsBoundary(stringBoundary)) {
|
if (!containsBoundary(stringTextRange)) {
|
||||||
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
|
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
|
||||||
}
|
}
|
||||||
if (stringBoundary.length() == 0) {
|
if (stringTextRange.length() == 0) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
|
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
|
||||||
|
|
||||||
if (stringBoundary.end() == this.boundary.end()) {
|
if (stringTextRange.end() == this.textRange.end()) {
|
||||||
return positions.subList(startPositionIdx, positions.size());
|
return positions.subList(startPositionIdx, positions.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
|
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||||
|
|
||||||
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
|
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
|
||||||
.stream()
|
.stream()
|
||||||
.map(this::getPositions)
|
.map(this::getPositions)
|
||||||
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
||||||
@ -217,9 +253,9 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
|
private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
||||||
|
|
||||||
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
|
return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
List<AtomicTextBlock> atomicTextBlocks;
|
List<AtomicTextBlock> atomicTextBlocks;
|
||||||
String searchText;
|
String searchText;
|
||||||
Boundary boundary;
|
TextRange textRange;
|
||||||
|
|
||||||
|
|
||||||
public static ConcatenatedTextBlock empty() {
|
public static ConcatenatedTextBlock empty() {
|
||||||
@ -37,29 +37,30 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
this.atomicTextBlocks = new LinkedList<>();
|
this.atomicTextBlocks = new LinkedList<>();
|
||||||
if (atomicTextBlocks.isEmpty()) {
|
if (atomicTextBlocks.isEmpty()) {
|
||||||
boundary = new Boundary(-1, -1);
|
textRange = new TextRange(-1, -1);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
var firstTextBlock = atomicTextBlocks.get(0);
|
var firstTextBlock = atomicTextBlocks.get(0);
|
||||||
this.atomicTextBlocks.add(firstTextBlock);
|
this.atomicTextBlocks.add(firstTextBlock);
|
||||||
boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
|
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
|
||||||
|
|
||||||
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
|
atomicTextBlocks.subList(1, atomicTextBlocks.size())
|
||||||
|
.forEach(this::concat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||||
|
|
||||||
int start = textBlock.getBoundary().start();
|
int start = textBlock.getTextRange().start();
|
||||||
int end = textBlock.getBoundary().end();
|
int end = textBlock.getTextRange().end();
|
||||||
if (this.atomicTextBlocks.isEmpty()) {
|
if (this.atomicTextBlocks.isEmpty()) {
|
||||||
boundary.setStart(start);
|
textRange.setStart(start);
|
||||||
boundary.setEnd(end);
|
textRange.setEnd(end);
|
||||||
} else if (boundary.end() != start) {
|
} else if (textRange.end() != start) {
|
||||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
|
||||||
}
|
}
|
||||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||||
boundary.setEnd(end);
|
textRange.setEnd(end);
|
||||||
this.searchText = null;
|
this.searchText = null;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
@ -67,13 +68,18 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
return atomicTextBlocks.stream()
|
||||||
|
.filter(textBlock -> textBlock.getTextRange().contains(stringIdx))
|
||||||
|
.findAny()
|
||||||
|
.orElseThrow(IndexOutOfBoundsException::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
|
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
|
return atomicTextBlocks.stream()
|
||||||
|
.filter(tb -> tb.getTextRange().intersects(textRange))
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
@Override
|
@Override
|
||||||
public int numberOfLines() {
|
public int numberOfLines() {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
|
return atomicTextBlocks.stream()
|
||||||
|
.map(AtomicTextBlock::getLineBreaks)
|
||||||
|
.mapToInt(List::size).sum();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
@Override
|
@Override
|
||||||
public List<Integer> getLineBreaks() {
|
public List<Integer> getLineBreaks() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
|
return getAtomicTextBlocks().stream()
|
||||||
|
.flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
|
||||||
|
.stream())
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -125,47 +136,48 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||||
|
|
||||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||||
|
|
||||||
if (textBlocks.size() == 1) {
|
if (textBlocks.size() == 1) {
|
||||||
return textBlocks.get(0).getPositions(stringBoundary);
|
return textBlocks.get(0).getPositions(stringTextRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||||
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
|
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||||
|
|
||||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||||
positions.addAll(textBlock.getPositions());
|
positions.addAll(textBlock.getPositions());
|
||||||
}
|
}
|
||||||
|
|
||||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||||
positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||||
|
|
||||||
return positions;
|
return positions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||||
|
|
||||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||||
|
|
||||||
if (textBlocks.size() == 1) {
|
if (textBlocks.size() == 1) {
|
||||||
return textBlocks.get(0).getPositionsPerPage(stringBoundary);
|
return textBlocks.get(0).getPositionsPerPage(stringTextRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));
|
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
|
||||||
|
|
||||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
|
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
|
||||||
}
|
}
|
||||||
|
|
||||||
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
||||||
lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
|
||||||
|
stringTextRange.end())));
|
||||||
|
|
||||||
return rectanglesPerLinePerPage;
|
return rectanglesPerLinePerPage;
|
||||||
}
|
}
|
||||||
@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
|
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
|
||||||
|
|
||||||
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
|
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
|
||||||
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
|
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
|
||||||
|
rectangles,
|
||||||
|
(l1, l2) -> Stream.concat(l1.stream(), l2.stream())
|
||||||
|
.toList()));
|
||||||
return mergedMap;
|
return mergedMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
|
||||||
|
|
||||||
|
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||||
|
|
||||||
|
if (textBlocks.size() == 1) {
|
||||||
|
return textBlocks.get(0).subSequenceWithLineBreaks(stringTextRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||||
|
sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||||
|
|
||||||
|
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||||
|
sb.append(textBlock.searchTextWithLineBreaks());
|
||||||
|
}
|
||||||
|
|
||||||
|
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||||
|
sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
@ -187,16 +230,22 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Boundary> getBoldTextBoundaries() {
|
public List<TextRange> getBoldTextBoundaries() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
|
return getAtomicTextBlocks().stream()
|
||||||
|
.map(AtomicTextBlock::getBoldTextBoundaries)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Boundary> getItalicTextBoundaries() {
|
public List<TextRange> getItalicTextBoundaries() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
|
return getAtomicTextBlocks().stream()
|
||||||
|
.map(AtomicTextBlock::getItalicTextBoundaries)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
|
|
||||||
public interface TextBlock extends CharSequence {
|
public interface TextBlock extends CharSequence {
|
||||||
@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence {
|
|||||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||||
|
|
||||||
|
|
||||||
List<Boundary> getBoldTextBoundaries();
|
List<TextRange> getBoldTextBoundaries();
|
||||||
|
|
||||||
|
|
||||||
List<Boundary> getItalicTextBoundaries();
|
List<TextRange> getItalicTextBoundaries();
|
||||||
|
|
||||||
|
|
||||||
String getOrientation();
|
String getOrientation();
|
||||||
@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence {
|
|||||||
int getTextDirection();
|
int getTextDirection();
|
||||||
|
|
||||||
|
|
||||||
Boundary getBoundary();
|
TextRange getTextRange();
|
||||||
|
|
||||||
|
|
||||||
int getNextLinebreak(int fromIndex);
|
int getNextLinebreak(int fromIndex);
|
||||||
@ -48,31 +48,41 @@ public interface TextBlock extends CharSequence {
|
|||||||
Rectangle2D getPosition(int stringIdx);
|
Rectangle2D getPosition(int stringIdx);
|
||||||
|
|
||||||
|
|
||||||
List<Rectangle2D> getPositions(Boundary stringBoundary);
|
List<Rectangle2D> getPositions(TextRange stringTextRange);
|
||||||
|
|
||||||
|
|
||||||
Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);
|
Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);
|
||||||
|
|
||||||
|
|
||||||
int numberOfLines();
|
int numberOfLines();
|
||||||
|
|
||||||
|
|
||||||
|
String subSequenceWithLineBreaks(TextRange stringTextRange);
|
||||||
|
|
||||||
|
|
||||||
|
default String searchTextWithLineBreaks() {
|
||||||
|
|
||||||
|
return subSequenceWithLineBreaks(getTextRange());
|
||||||
|
}
|
||||||
|
|
||||||
default int indexOf(String searchTerm) {
|
default int indexOf(String searchTerm) {
|
||||||
|
|
||||||
return indexOf(searchTerm, getBoundary().start());
|
return indexOf(searchTerm, getTextRange().start());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default Set<Page> getPages() {
|
default Set<Page> getPages() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
|
return getAtomicTextBlocks().stream()
|
||||||
|
.map(AtomicTextBlock::getPage)
|
||||||
|
.collect(Collectors.toUnmodifiableSet());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default Set<Page> getPages(Boundary boundary) {
|
default Set<Page> getPages(TextRange textRange) {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream()
|
return getAtomicTextBlocks().stream()
|
||||||
.filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
|
.filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
|
||||||
.map(AtomicTextBlock::getPage)
|
.map(AtomicTextBlock::getPage)
|
||||||
.collect(Collectors.toUnmodifiableSet());
|
.collect(Collectors.toUnmodifiableSet());
|
||||||
}
|
}
|
||||||
@ -80,38 +90,38 @@ public interface TextBlock extends CharSequence {
|
|||||||
|
|
||||||
default int indexOf(String searchTerm, int startOffset) {
|
default int indexOf(String searchTerm, int startOffset) {
|
||||||
|
|
||||||
int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
|
int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
|
||||||
if (start == -1) {
|
if (start == -1) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
return start + getBoundary().start();
|
return start + getTextRange().start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default CharSequence getFirstLine() {
|
default CharSequence getFirstLine() {
|
||||||
|
|
||||||
return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
|
return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default boolean containsBoundary(Boundary boundary) {
|
default boolean containsBoundary(TextRange textRange) {
|
||||||
|
|
||||||
if (boundary.end() < boundary.start()) {
|
if (textRange.end() < textRange.start()) {
|
||||||
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
|
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
|
||||||
}
|
}
|
||||||
return getBoundary().contains(boundary);
|
return getTextRange().contains(textRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default boolean containsIndex(int stringIndex) {
|
default boolean containsIndex(int stringIndex) {
|
||||||
|
|
||||||
return getBoundary().contains(stringIndex);
|
return getTextRange().contains(stringIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default CharSequence subSequence(Boundary boundary) {
|
default CharSequence subSequence(TextRange textRange) {
|
||||||
|
|
||||||
return subSequence(boundary.start(), boundary.end());
|
return subSequence(textRange.start(), textRange.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -128,21 +138,21 @@ public interface TextBlock extends CharSequence {
|
|||||||
@Override
|
@Override
|
||||||
default CharSequence subSequence(int start, int end) {
|
default CharSequence subSequence(int start, int end) {
|
||||||
|
|
||||||
return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
|
return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
default int length() {
|
default int length() {
|
||||||
|
|
||||||
return getBoundary().length();
|
return getTextRange().length();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
default char charAt(int index) {
|
default char charAt(int index) {
|
||||||
|
|
||||||
return getSearchText().charAt(index - getBoundary().start());
|
return getSearchText().charAt(index - getTextRange().start());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto {
|
|||||||
String searchText;
|
String searchText;
|
||||||
List<Integer> lineBreaks;
|
List<Integer> lineBreaks;
|
||||||
List<Integer> stringIdxToPositionIdx;
|
List<Integer> stringIdxToPositionIdx;
|
||||||
List<Boundary> boldTextBoundaries;
|
List<TextRange> boldTextBoundaries;
|
||||||
List<Boundary> italicTextBoundaries;
|
List<TextRange> italicTextBoundaries;
|
||||||
List<Rectangle2D> positions;
|
List<Rectangle2D> positions;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
private static List<TextRange> mergeToBoundaries(List<Integer> integers) {
|
||||||
|
|
||||||
if (integers.isEmpty()) {
|
if (integers.isEmpty()) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
List<Boundary> boundaries = new LinkedList<>();
|
List<TextRange> boundaries = new LinkedList<>();
|
||||||
int start = integers.get(0);
|
int start = integers.get(0);
|
||||||
int end = integers.get(0) + 1;
|
int end = integers.get(0) + 1;
|
||||||
for (int current : integers) {
|
for (int current : integers) {
|
||||||
if (current > end + 1) {
|
if (current > end + 1) {
|
||||||
boundaries.add(new Boundary(start, end));
|
boundaries.add(new TextRange(start, end));
|
||||||
start = current;
|
start = current;
|
||||||
}
|
}
|
||||||
end = current + 1;
|
end = current + 1;
|
||||||
}
|
}
|
||||||
if (boundaries.isEmpty()) {
|
if (boundaries.isEmpty()) {
|
||||||
boundaries.add(new Boundary(start, end));
|
boundaries.add(new TextRange(start, end));
|
||||||
}
|
}
|
||||||
return boundaries;
|
return boundaries;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -116,8 +116,8 @@ public class DocumentDataMapper {
|
|||||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||||
.searchText(atomicTextBlock.getSearchText())
|
.searchText(atomicTextBlock.getSearchText())
|
||||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||||
.start(atomicTextBlock.getBoundary().start())
|
.start(atomicTextBlock.getTextRange().start())
|
||||||
.end(atomicTextBlock.getBoundary().end())
|
.end(atomicTextBlock.getTextRange().end())
|
||||||
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
@ -82,15 +82,15 @@ public class TaasDocumentDataMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Range toRange(Boundary boundary) {
|
private static Range toRange(TextRange textRange) {
|
||||||
|
|
||||||
return new Range(boundary.start(), boundary.end());
|
return new Range(textRange.start(), textRange.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Range> toRange(List<Boundary> boundary) {
|
private static List<Range> toRange(List<TextRange> textRange) {
|
||||||
|
|
||||||
return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
|
return textRange.stream().map(TaasDocumentDataMapper::toRange).toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,71 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
|
||||||
|
|
||||||
class BoundaryTest {
|
|
||||||
|
|
||||||
Boundary startBoundary;
|
|
||||||
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
void setUp() {
|
|
||||||
|
|
||||||
startBoundary = new Boundary(10, 100);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testContains() {
|
|
||||||
|
|
||||||
assertTrue(startBoundary.contains(11));
|
|
||||||
assertTrue(startBoundary.contains(50));
|
|
||||||
assertFalse(startBoundary.contains(9));
|
|
||||||
assertFalse(startBoundary.contains(100));
|
|
||||||
assertFalse(startBoundary.contains(150));
|
|
||||||
assertFalse(startBoundary.contains(-123));
|
|
||||||
assertTrue(startBoundary.contains(new Boundary(11, 99)));
|
|
||||||
assertTrue(startBoundary.contains(new Boundary(10, 100)));
|
|
||||||
assertTrue(startBoundary.contains(new Boundary(11, 11)));
|
|
||||||
assertFalse(startBoundary.contains(9, 100));
|
|
||||||
assertTrue(startBoundary.contains(100, 100));
|
|
||||||
assertFalse(startBoundary.contains(100, 101));
|
|
||||||
assertFalse(startBoundary.contains(150, 151));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testIntersects() {
|
|
||||||
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(1, 11)));
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(11, 12)));
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(11, 100)));
|
|
||||||
assertFalse(startBoundary.intersects(new Boundary(100, 101)));
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(99, 101)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testSplit() {
|
|
||||||
|
|
||||||
assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size());
|
|
||||||
assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90)));
|
|
||||||
assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40)));
|
|
||||||
assertEquals(1, startBoundary.split(Collections.emptyList()).size());
|
|
||||||
assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size());
|
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0)));
|
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100)));
|
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100)));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -0,0 +1,71 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
|
||||||
|
class TextRangeTest {
|
||||||
|
|
||||||
|
TextRange startTextRange;
|
||||||
|
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
|
||||||
|
startTextRange = new TextRange(10, 100);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testContains() {
|
||||||
|
|
||||||
|
assertTrue(startTextRange.contains(11));
|
||||||
|
assertTrue(startTextRange.contains(50));
|
||||||
|
assertFalse(startTextRange.contains(9));
|
||||||
|
assertFalse(startTextRange.contains(100));
|
||||||
|
assertFalse(startTextRange.contains(150));
|
||||||
|
assertFalse(startTextRange.contains(-123));
|
||||||
|
assertTrue(startTextRange.contains(new TextRange(11, 99)));
|
||||||
|
assertTrue(startTextRange.contains(new TextRange(10, 100)));
|
||||||
|
assertTrue(startTextRange.contains(new TextRange(11, 11)));
|
||||||
|
assertFalse(startTextRange.contains(9, 100));
|
||||||
|
assertTrue(startTextRange.contains(100, 100));
|
||||||
|
assertFalse(startTextRange.contains(100, 101));
|
||||||
|
assertFalse(startTextRange.contains(150, 151));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testIntersects() {
|
||||||
|
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(1, 11)));
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(11, 12)));
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(11, 100)));
|
||||||
|
assertFalse(startTextRange.intersects(new TextRange(100, 101)));
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(99, 101)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testSplit() {
|
||||||
|
|
||||||
|
assertEquals(4, startTextRange.split(List.of(12, 40, 90)).size());
|
||||||
|
assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(List.of(12, 40, 90)));
|
||||||
|
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
|
||||||
|
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
|
||||||
|
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
|
||||||
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
|
||||||
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
|
||||||
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -105,7 +105,7 @@ public abstract class AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||||
|
|
||||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||||
return LayoutParsingRequest.builder()
|
return LayoutParsingRequest.builder()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user