From e68869495a294ead280f749c780b42b26efa6401 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 8 Jul 2024 13:38:40 +0200 Subject: [PATCH] Clari-002: render document data as markdown --- .../markdown/DocumentDataParser.java | 190 ++++++++++++++++++ .../graph/{Boundary.java => TextRange.java} | 38 ++-- .../model/graph/entity/RedactionEntity.java | 18 +- .../model/graph/nodes/SemanticNode.java | 20 +- .../graph/textblock/AtomicTextBlock.java | 94 ++++++--- .../textblock/ConcatenatedTextBlock.java | 115 ++++++++--- .../model/graph/textblock/TextBlock.java | 56 +++--- .../SearchTextWithTextPositionDto.java | 6 +- .../SearchTextWithTextPositionFactory.java | 10 +- .../services/mapper/DocumentDataMapper.java | 4 +- .../mapper/TaasDocumentDataMapper.java | 10 +- .../server/graph/BoundaryTest.java | 71 ------- .../server/graph/TextRangeTest.java | 71 +++++++ 13 files changed, 494 insertions(+), 209 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/{Boundary.java => TextRange.java} (64%) delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java new file mode 100644 index 0000000..60645dd --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java
@@ -0,0 +1,190 @@
+package com.knecon.fforesight.service.layoutparser.processor.markdown;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Stream;
+
+import org.commonmark.ext.gfm.tables.TableBlock;
+import org.commonmark.node.CustomBlock;
+import org.commonmark.node.Document;
+import org.commonmark.node.Emphasis;
+import org.commonmark.node.Heading;
+import org.commonmark.node.Node;
+import org.commonmark.node.StrongEmphasis;
+import org.commonmark.node.Text;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
+
+import lombok.AccessLevel;
+import lombok.RequiredArgsConstructor;
+import lombok.experimental.FieldDefaults;
+
+/**
+ * Renders semantic document nodes into the commonmark {@link Document} passed to the constructor.
+ * Headlines and paragraphs keep their bold and italic runs, tables are appended as empty table blocks.
+ */
+@RequiredArgsConstructor
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class DocumentDataParser {
+
+    Document document;
+
+
+    /**
+     * Appends one markdown block per supported node to the document.
+     *
+     * @param semanticNodes nodes to render, in the order they should appear
+     * @return the document given to the constructor, with the rendered blocks appended
+     */
+    public Document parse(Stream<SemanticNode> semanticNodes) {
+
+        semanticNodes.forEach(this::parseNode);
+        return document;
+    }
+
+
+    /**
+     * Dispatches the node to the renderer of its type, nodes of any other type are skipped.
+     */
+    private void parseNode(SemanticNode semanticNode) {
+
+        switch (semanticNode.getType()) {
+            case HEADLINE -> parseHeadline((Headline) semanticNode);
+            case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
+            case TABLE -> parseTable((Table) semanticNode);
+            default -> {
+                // headlines, paragraphs and tables are the only node types rendered
+            }
+        }
+    }
+
+
+    private void parseTable(Table table) {
+
+        // NOTE(review): only an empty table block is appended, the cells of the table are not rendered yet
+        CustomBlock tableNode = new TableBlock();
+        document.appendChild(tableNode);
+    }
+
+
+    private void parseParagraph(Paragraph paragraph) {
+
+        org.commonmark.node.Paragraph paragraphNode = new org.commonmark.node.Paragraph();
+        parseTextBlock(paragraph.getTextBlock()).forEach(paragraphNode::appendChild);
+        document.appendChild(paragraphNode);
+    }
+
+
+    private void parseHeadline(Headline headline) {
+
+        // NOTE(review): the heading level is never set - confirm the renderer copes with the default level
+        Heading heading = new Heading();
+        parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
+        document.appendChild(heading);
+    }
+
+
+    /**
+     * Converts a text block into inline nodes, one per run of identically styled text.
+     */
+    private List<Node> parseTextBlock(TextBlock textBlock) {
+
+        List<Node> result = new ArrayList<>();
+        for (TextRangeWithTextType styledRange : mergeTextStyles(textBlock)) {
+            Text text = new Text(textBlock.subSequenceWithLineBreaks(styledRange.textRange()));
+            Node node = switch (styledRange.fontStyle()) {
+                case REGULAR -> text;
+                case BOLD -> wrap(new StrongEmphasis(), text);
+                case ITALIC -> wrap(new Emphasis(), text);
+                case BOLD_ITALIC -> wrap(new Emphasis(), wrap(new StrongEmphasis(), text));
+            };
+            result.add(node);
+        }
+        return result;
+    }
+
+
+    /** Appends the child to the parent and returns the parent, so that inline nodes can be nested. */
+    private static Node wrap(Node parent, Node child) {
+
+        parent.appendChild(child);
+        return parent;
+    }
+
+
+    /**
+     * Splits the text block into consecutive ranges of constant font style.
+     * <p>
+     * The bold and italic ranges are swept from left to right while counting how many of them are open,
+     * so a style ends exactly where its range ends, and overlapping bold and italic ranges yield BOLD_ITALIC.
+     * NOTE(review): assumes the style ranges use the same string coordinates as {@code getTextRange()} - confirm.
+     */
+    private List<TextRangeWithTextType> mergeTextStyles(TextBlock textBlock) {
+
+        // position -> change of the number of open {bold, italic} ranges at that position
+        TreeMap<Integer, int[]> styleChanges = new TreeMap<>();
+        for (TextRange bold : textBlock.getBoldTextBoundaries()) {
+            styleChanges.computeIfAbsent(bold.start(), k -> new int[2])[0]++;
+            styleChanges.computeIfAbsent(bold.end(), k -> new int[2])[0]--;
+        }
+        for (TextRange italic : textBlock.getItalicTextBoundaries()) {
+            styleChanges.computeIfAbsent(italic.start(), k -> new int[2])[1]++;
+            styleChanges.computeIfAbsent(italic.end(), k -> new int[2])[1]--;
+        }
+
+        List<TextRangeWithTextType> result = new ArrayList<>();
+        // subSequenceWithLineBreaks only accepts ranges inside getTextRange(), so the sweep starts at the block start, not at 0
+        int start = textBlock.getTextRange().start();
+        int end = textBlock.getTextRange().end();
+        int openBold = 0;
+        int openItalic = 0;
+
+        for (Map.Entry<Integer, int[]> entry : styleChanges.entrySet()) {
+            int point = entry.getKey();
+            if (point > start) {
+                result.add(new TextRangeWithTextType(new TextRange(start, point), determineFontStyle(openBold > 0, openItalic > 0)));
+                start = point;
+            }
+            openBold += entry.getValue()[0];
+            openItalic += entry.getValue()[1];
+        }
+
+        if (start < end) {
+            result.add(new TextRangeWithTextType(new TextRange(start, end), determineFontStyle(openBold > 0, openItalic > 0)));
+        }
+
+        return result;
+    }
+
+
+    /**
+     * Maps the two style flags to the matching font style.
+     */
+    private static FontStyle determineFontStyle(boolean bold, boolean italic) {
+
+        if (bold) {
+            return italic ? FontStyle.BOLD_ITALIC : FontStyle.BOLD;
+        }
+        return italic ? FontStyle.ITALIC : FontStyle.REGULAR;
+    }
+
+
+    enum FontStyle {
+        REGULAR,
+        BOLD,
+        ITALIC,
+        BOLD_ITALIC;
+    }
+
+    record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java
similarity index 64%
rename
from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java index 82b5275..62d41a9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java @@ -13,13 +13,13 @@ import lombok.Setter; @Setter @EqualsAndHashCode @SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName") -public class Boundary implements Comparable { +public class TextRange implements Comparable { private int start; private int end; - public Boundary(int start, int end) { + public TextRange(int start, int end) { if (start > end) { throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); @@ -47,15 +47,15 @@ public class Boundary implements Comparable { } - public boolean contains(Boundary boundary) { + public boolean contains(TextRange textRange) { - return start <= boundary.start() && boundary.end() <= end; + return start <= textRange.start() && textRange.end() <= end; } - public boolean containedBy(Boundary boundary) { + public boolean containedBy(TextRange textRange) { - return boundary.contains(this); + return textRange.contains(this); } @@ -83,18 +83,18 @@ public class Boundary implements Comparable { } - public boolean intersects(Boundary boundary) { + public boolean intersects(TextRange textRange) { - return boundary.start() < this.end && this.start < boundary.end(); + return textRange.start() < this.end && this.start < textRange.end(); } - public List split(List splitIndices) { + public List split(List splitIndices) { if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) { 
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this)); } - List splitBoundaries = new LinkedList<>(); + List splitBoundaries = new LinkedList<>(); int previousIndex = start; for (int splitIndex : splitIndices) { @@ -102,10 +102,10 @@ public class Boundary implements Comparable { if (splitIndex == previousIndex) { continue; } - splitBoundaries.add(new Boundary(previousIndex, splitIndex)); + splitBoundaries.add(new TextRange(previousIndex, splitIndex)); previousIndex = splitIndex; } - splitBoundaries.add(new Boundary(previousIndex, end)); + splitBoundaries.add(new TextRange(previousIndex, end)); return splitBoundaries; } @@ -114,11 +114,11 @@ public class Boundary implements Comparable { return IntStream.range(start, end); } - public static Boundary merge(Collection boundaries) { + public static TextRange merge(Collection boundaries) { - int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new); - int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new); - return new Boundary(minStart, maxEnd); + int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new); + int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new); + return new TextRange(minStart, maxEnd); } @@ -130,12 +130,12 @@ public class Boundary implements Comparable { @Override - public int compareTo(Boundary boundary) { + public int compareTo(TextRange textRange) { - if (end < boundary.end() && start < boundary.start()) { + if (end < textRange.end() && start < textRange.start()) { return -1; } - if (start > boundary.start() && end > boundary.end()) { + if (start > textRange.start() && end > textRange.end()) { return 1; } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java index bfa9f9b..09d00df 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java @@ -11,7 +11,7 @@ import java.util.Map; import java.util.Set; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; @@ -32,7 +32,7 @@ public class RedactionEntity { // initial values @EqualsAndHashCode.Include - final Boundary boundary; + final TextRange textRange; @EqualsAndHashCode.Include final String type; @EqualsAndHashCode.Include @@ -66,9 +66,9 @@ public class RedactionEntity { SemanticNode deepestFullyContainingNode; - public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) { + public static RedactionEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) { - return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build(); + return 
RedactionEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build(); } @@ -132,7 +132,7 @@ public class RedactionEntity { public List getRedactionPositionsPerPage() { if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) { - Map> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary); + Map> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange); Page firstPage = rectanglesPerLinePerPage.keySet() .stream() @@ -157,19 +157,19 @@ public class RedactionEntity { public boolean containedBy(RedactionEntity redactionEntity) { - return this.boundary.containedBy(redactionEntity.getBoundary()); + return this.textRange.containedBy(redactionEntity.getTextRange()); } public boolean contains(RedactionEntity redactionEntity) { - return this.boundary.contains(redactionEntity.getBoundary()); + return this.textRange.contains(redactionEntity.getTextRange()); } public boolean intersects(RedactionEntity redactionEntity) { - return this.boundary.intersects(redactionEntity.getBoundary()); + return this.textRange.intersects(redactionEntity.getTextRange()); } @@ -210,7 +210,7 @@ public class RedactionEntity { sb.append("Entity[\""); sb.append(value); sb.append("\", "); - sb.append(boundary); + sb.append(textRange); sb.append(", pages["); pages.forEach(page -> { sb.append(page.getNumber()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index f26289f..9fc8297 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -14,7 +14,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; @@ -82,12 +82,12 @@ public interface SemanticNode { * * @return Set of PageNodes this node appears on. */ - default Set getPages(Boundary boundary) { + default Set getPages(TextRange textRange) { - if (!getBoundary().contains(boundary)) { - throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary())); + if (!getBoundary().contains(textRange)) { + throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary())); } - return getTextBlock().getPages(boundary); + return getTextBlock().getPages(textRange); } @@ -344,13 +344,13 @@ public interface SemanticNode { default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) { TextBlock textBlock = getTextBlock(); - if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) { - if (textBlock.containsBoundary(redactionEntity.getBoundary())) { + if (textBlock.getTextRange().intersects(redactionEntity.getTextRange())) { + if (textBlock.containsBoundary(redactionEntity.getTextRange())) { 
redactionEntity.setDeepestFullyContainingNode(this); } redactionEntity.addIntersectingNode(this); - streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary())) + streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getTextRange())) .forEach(node -> node.addThisToEntityIfIntersects(redactionEntity)); } } @@ -425,9 +425,9 @@ public interface SemanticNode { * * @return Boundary of this Node's TextBlock */ - default Boundary getBoundary() { + default TextRange getBoundary() { - return getTextBlock().getBoundary(); + return getTextBlock().getTextRange(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java index 8d393fd..6fbc13e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java @@ -10,10 +10,12 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; @@ -36,14 +38,14 @@ public class AtomicTextBlock implements TextBlock { Page page; //string coordinates - Boundary boundary; + TextRange textRange; String searchText; @Builder.Default List lineBreaks = new ArrayList<>(); @Builder.Default - List boldTextBoundaries = new ArrayList<>(); + List boldTextBoundaries = new ArrayList<>(); @Builder.Default - List italicTextBoundaries = new ArrayList<>(); + List italicTextBoundaries = new ArrayList<>(); String orientation; int textDirection; @@ -64,10 +66,51 @@ public class AtomicTextBlock implements TextBlock { }
+
+    /**
+     * Returns the text of the given range, with the whitespace character at each line break replaced by a newline.
+     *
+     * @param stringTextRange range in string coordinates, has to lie inside the range of this block
+     * @return the text with line breaks, or an empty string if the range is empty or not contained in this block
+     */
+    @Override
+    public String subSequenceWithLineBreaks(TextRange stringTextRange) {
+
+        if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
+            return "";
+        }
+
+        // line breaks are stored relative to the start of this block, so they are shifted by the block start
+        Set<Integer> lbInBoundary = lineBreaks.stream()
+                .map(i -> i + textRange.start())
+                .filter(stringTextRange::contains)
+                .collect(Collectors.toSet());
+        if (stringTextRange.end() == getTextRange().end()) {
+            lbInBoundary.add(getTextRange().end());
+        }
+        StringBuilder sb = new StringBuilder();
+        for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
+            char character = this.charAt(i);
+            if (lbInBoundary.contains(i + 1)) {
+                // always plus one, due to the linebreaks being an exclusive end index
+                if (!Character.isWhitespace(character)) {
+                    // no whitespace to replace here, so the line break is pushed on to the next character
+                    lbInBoundary.remove(i + 1);
+                    lbInBoundary.add(i + 2);
+                    sb.append(character);
+                    continue;
+                }
+                sb.append("\n");
+            } else {
+                sb.append(character);
+            }
+        }
+        return sb.toString();
+    }
+ + + public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText, List lineBreaks, - List boldTextBoundaries, - List italicTextBoundaries, + List boldTextBoundaries, + List italicTextBoundaries, List positions, List stringIdxToPositionIdx, long idx, @@ -89,7 +125,7 @@ public class AtomicTextBlock implements 
TextBlock { .italicTextBoundaries(italicTextBoundaries) .positions(positions) .stringIdxToPositionIdx(stringIdxToPositionIdx) - .boundary(new Boundary(offset, offset + searchText.length())) + .textRange(new TextRange(offset, offset + searchText.length())) .textDirection(textDirection) .orientation(orientation) .build(); @@ -100,7 +136,7 @@ public class AtomicTextBlock implements TextBlock { return AtomicTextBlock.builder() .id(textBlockIdx) - .boundary(new Boundary(stringOffset, stringOffset)) + .textRange(new TextRange(stringOffset, stringOffset)) .searchText("") .page(page) .numberOnPage(numberOnPage) @@ -115,7 +151,7 @@ public class AtomicTextBlock implements TextBlock { .id(documentTextData.getId()) .numberOnPage(documentTextData.getNumberOnPage()) .page(page) - .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd())) + .textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd())) .searchText(documentTextData.getSearchText()) .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed() .toList()) @@ -141,11 +177,11 @@ public class AtomicTextBlock implements TextBlock { throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines())); } if (lineNumber == 0) { - return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start()); + return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start()); } else if (lineNumber == numberOfLines() - 1) { - return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end()); + return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end()); } - return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start()); + return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start()); } @@ -160,9 +196,9 @@ public class AtomicTextBlock 
implements TextBlock { public int getNextLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak > fromIndex - boundary.start()) // + .filter(linebreak -> linebreak > fromIndex - textRange.start()) // .findFirst() // - .orElse(searchText.length()) + boundary.start(); + .orElse(searchText.length()) + textRange.start(); } @@ -170,43 +206,43 @@ public class AtomicTextBlock implements TextBlock { public int getPreviousLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak <= fromIndex - boundary.start())// + .filter(linebreak -> linebreak <= fromIndex - textRange.start())// .reduce((a, b) -> b)// - .orElse(0) + boundary.start(); + .orElse(0) + textRange.start(); } @Override public Rectangle2D getPosition(int stringIdx) { - return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start())); + return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start())); } @Override - public List getPositions(Boundary stringBoundary) { + public List getPositions(TextRange stringTextRange) { - if (!containsBoundary(stringBoundary)) { - throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary)); + if (!containsBoundary(stringTextRange)) { + throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange)); } - if (stringBoundary.length() == 0) { + if (stringTextRange.length() == 0) { return Collections.emptyList(); } - int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()); + int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start()); - if (stringBoundary.end() == this.boundary.end()) { + if (stringTextRange.end() == this.textRange.end()) { return positions.subList(startPositionIdx, positions.size()); } - return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start())); + 
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start())); } - public Map> getPositionsPerPage(Boundary stringBoundary) { + public Map> getPositionsPerPage(TextRange stringTextRange) { - List rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary)) + List rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange)) .stream() .map(this::getPositions) .map(RectangleTransformations::rectangleBBoxWithGaps) @@ -218,11 +254,11 @@ public class AtomicTextBlock implements TextBlock { } - private List getAllLineBreaksInBoundary(Boundary boundary) { + private List getAllLineBreaksInBoundary(TextRange textRange) { return getLineBreaks().stream() - .map(linebreak -> linebreak + this.boundary.start()) - .filter(boundary::contains) + .map(linebreak -> linebreak + this.textRange.start()) + .filter(textRange::contains) .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java index d48170b..c835658 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java @@ -11,7 +11,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import lombok.AccessLevel; @@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock { List atomicTextBlocks; String searchText; - Boundary boundary; + TextRange textRange; public static ConcatenatedTextBlock empty() { @@ -37,29 +37,30 @@ public class ConcatenatedTextBlock implements TextBlock { this.atomicTextBlocks = new LinkedList<>(); if (atomicTextBlocks.isEmpty()) { - boundary = new Boundary(-1, -1); + textRange = new TextRange(-1, -1); return; } var firstTextBlock = atomicTextBlocks.get(0); this.atomicTextBlocks.add(firstTextBlock); - boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end()); + textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end()); - atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat); + atomicTextBlocks.subList(1, atomicTextBlocks.size()) + .forEach(this::concat); } public ConcatenatedTextBlock concat(TextBlock textBlock) { - int start = textBlock.getBoundary().start(); - int end = textBlock.getBoundary().end(); + int start = textBlock.getTextRange().start(); + int end = textBlock.getTextRange().end(); if (this.atomicTextBlocks.isEmpty()) { - boundary.setStart(start); - boundary.setEnd(end); - } else if (boundary.end() != start) { - throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); + textRange.setStart(start); + textRange.setEnd(end); + } else if (textRange.end() != start) { + throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange())); } this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks()); - boundary.setEnd(end); + textRange.setEnd(end); this.searchText = null; return this; } @@ -67,13 +68,18 @@ public class ConcatenatedTextBlock 
implements TextBlock { private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) { - return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new); + return atomicTextBlocks.stream() + .filter(textBlock -> textBlock.getTextRange().contains(stringIdx)) + .findAny() + .orElseThrow(IndexOutOfBoundsException::new); } - private List getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) { + private List getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) { - return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList(); + return atomicTextBlocks.stream() + .filter(tb -> tb.getTextRange().intersects(textRange)) + .toList(); } @@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock { @Override public int numberOfLines() { - return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum(); + return atomicTextBlocks.stream() + .map(AtomicTextBlock::getLineBreaks) + .mapToInt(List::size).sum(); } @@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock { @Override public List getLineBreaks() { - return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList(); + return getAtomicTextBlocks().stream() + .flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks() + .stream()) + .toList(); } @@ -125,47 +136,48 @@ public class ConcatenatedTextBlock implements TextBlock { @Override - public List getPositions(Boundary stringBoundary) { + public List getPositions(TextRange stringTextRange) { - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary); + List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); if (textBlocks.size() == 1) { - return textBlocks.get(0).getPositions(stringBoundary); + return textBlocks.get(0).getPositions(stringTextRange); } 
AtomicTextBlock firstTextBlock = textBlocks.get(0); - List positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()))); + List positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()))); for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { positions.addAll(textBlock.getPositions()); } var lastTextBlock = textBlocks.get(textBlocks.size() - 1); - positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end()))); + positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); return positions; } @Override - public Map> getPositionsPerPage(Boundary stringBoundary) { + public Map> getPositionsPerPage(TextRange stringTextRange) { - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary); + List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); if (textBlocks.size() == 1) { - return textBlocks.get(0).getPositionsPerPage(stringBoundary); + return textBlocks.get(0).getPositionsPerPage(stringTextRange); } AtomicTextBlock firstTextBlock = textBlocks.get(0); - Map> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())); + Map> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())); for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { - rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary())); + rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange())); } AtomicTextBlock 
lastTextBlock = textBlocks.get(textBlocks.size() - 1); rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, - lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end()))); + lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), + stringTextRange.end()))); return rectanglesPerLinePerPage; } @@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock { private Map> mergeEntityPositionsWithSamePageNode(Map> map1, Map> map2) { Map> mergedMap = new HashMap<>(map1); - map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList())); + map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, + rectangles, + (l1, l2) -> Stream.concat(l1.stream(), l2.stream()) + .toList())); return mergedMap; } + @Override + public String subSequenceWithLineBreaks(TextRange stringTextRange) { + + if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) { + return ""; + } + + List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); + + if (textBlocks.size() == 1) { + return textBlocks.get(0).subSequenceWithLineBreaks(stringTextRange); + } + + StringBuilder sb = new StringBuilder(); + AtomicTextBlock firstTextBlock = textBlocks.get(0); + sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()))); + + for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { + sb.append(textBlock.searchTextWithLineBreaks()); + } + + var lastTextBlock = textBlocks.get(textBlocks.size() - 1); + sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); + + return sb.toString(); + } + + @Override public String toString() { @@ -187,16 +230,22 @@ public class ConcatenatedTextBlock implements 
TextBlock { @Override - public List getBoldTextBoundaries() { + public List getBoldTextBoundaries() { - return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList(); + return getAtomicTextBlocks().stream() + .map(AtomicTextBlock::getBoldTextBoundaries) + .flatMap(Collection::stream) + .toList(); } @Override - public List getItalicTextBoundaries() { + public List getItalicTextBoundaries() { - return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList(); + return getAtomicTextBlocks().stream() + .map(AtomicTextBlock::getItalicTextBoundaries) + .flatMap(Collection::stream) + .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java index df9c427..bef2749 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java @@ -10,7 +10,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; public interface TextBlock extends CharSequence { @@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence { List getAtomicTextBlocks(); - List getBoldTextBoundaries(); + List getBoldTextBoundaries(); - List getItalicTextBoundaries(); + List 
getItalicTextBoundaries(); String getOrientation(); @@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence { int getTextDirection(); - Boundary getBoundary(); + TextRange getTextRange(); int getNextLinebreak(int fromIndex); @@ -48,31 +48,41 @@ public interface TextBlock extends CharSequence { Rectangle2D getPosition(int stringIdx); - List getPositions(Boundary stringBoundary); + List getPositions(TextRange stringTextRange); - Map> getPositionsPerPage(Boundary stringBoundary); + Map> getPositionsPerPage(TextRange stringTextRange); int numberOfLines(); + String subSequenceWithLineBreaks(TextRange stringTextRange); + + + default String searchTextWithLineBreaks() { + + return subSequenceWithLineBreaks(getTextRange()); + } + default int indexOf(String searchTerm) { - return indexOf(searchTerm, getBoundary().start()); + return indexOf(searchTerm, getTextRange().start()); } default Set getPages() { - return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet()); + return getAtomicTextBlocks().stream() + .map(AtomicTextBlock::getPage) + .collect(Collectors.toUnmodifiableSet()); } - default Set getPages(Boundary boundary) { + default Set getPages(TextRange textRange) { return getAtomicTextBlocks().stream() - .filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary)) + .filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange)) .map(AtomicTextBlock::getPage) .collect(Collectors.toUnmodifiableSet()); } @@ -80,38 +90,38 @@ public interface TextBlock extends CharSequence { default int indexOf(String searchTerm, int startOffset) { - int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start()); + int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start()); if (start == -1) { return -1; } - return start + getBoundary().start(); + return start + getTextRange().start(); } default CharSequence getFirstLine() { - return 
subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start())); + return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start())); } - default boolean containsBoundary(Boundary boundary) { + default boolean containsBoundary(TextRange textRange) { - if (boundary.end() < boundary.start()) { - throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary)); + if (textRange.end() < textRange.start()) { + throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange)); } - return getBoundary().contains(boundary); + return getTextRange().contains(textRange); } default boolean containsIndex(int stringIndex) { - return getBoundary().contains(stringIndex); + return getTextRange().contains(stringIndex); } - default CharSequence subSequence(Boundary boundary) { + default CharSequence subSequence(TextRange textRange) { - return subSequence(boundary.start(), boundary.end()); + return subSequence(textRange.start(), textRange.end()); } @@ -128,21 +138,21 @@ public interface TextBlock extends CharSequence { @Override default CharSequence subSequence(int start, int end) { - return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start()); + return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start()); } @Override default int length() { - return getBoundary().length(); + return getTextRange().length(); } @Override default char charAt(int index) { - return getSearchText().charAt(index - getBoundary().start()); + return getSearchText().charAt(index - getTextRange().start()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java index 96118cd..48bf5de 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java @@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D; import java.util.Collections; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import lombok.AccessLevel; import lombok.Builder; @@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto { String searchText; List lineBreaks; List stringIdxToPositionIdx; - List boldTextBoundaries; - List italicTextBoundaries; + List boldTextBoundaries; + List italicTextBoundaries; List positions; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 0d9fd8f..8acdf00 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.Locale; import java.util.Objects; -import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory { } - private static List mergeToBoundaries(List integers) { + private static List mergeToBoundaries(List integers) { if (integers.isEmpty()) { return Collections.emptyList(); } - List boundaries = new LinkedList<>(); + List boundaries = new LinkedList<>(); int start = integers.get(0); int end = integers.get(0) + 1; for (int current : integers) { if (current > end + 1) { - boundaries.add(new Boundary(start, end)); + boundaries.add(new TextRange(start, end)); start = current; } end = current + 1; } if (boundaries.isEmpty()) { - boundaries.add(new Boundary(start, end)); + boundaries.add(new TextRange(start, end)); } return boundaries; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java index b4b20d8..3a0a076 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java @@ -116,8 +116,8 @@ public class DocumentDataMapper { .page(atomicTextBlock.getPage().getNumber().longValue()) .searchText(atomicTextBlock.getSearchText()) 
.numberOnPage(atomicTextBlock.getNumberOnPage()) - .start(atomicTextBlock.getBoundary().start()) - .end(atomicTextBlock.getBoundary().end()) + .start(atomicTextBlock.getTextRange().start()) + .end(atomicTextBlock.getTextRange().end()) .lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks())) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java index 1978503..c54f5ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java @@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; @@ -82,15 +82,15 @@ public class TaasDocumentDataMapper { } - private static Range toRange(Boundary boundary) { + private static Range toRange(TextRange textRange) { - return new 
Range(boundary.start(), boundary.end()); + return new Range(textRange.start(), textRange.end()); } - private static List toRange(List boundary) { + private static List toRange(List textRange) { - return boundary.stream().map(TaasDocumentDataMapper::toRange).toList(); + return textRange.stream().map(TaasDocumentDataMapper::toRange).toList(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java deleted file mode 100644 index 4c704f7..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.graph; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Collections; -import java.util.List; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; - -class BoundaryTest { - - Boundary startBoundary; - - - @BeforeEach - void setUp() { - - startBoundary = new Boundary(10, 100); - } - - - @Test - void testContains() { - - assertTrue(startBoundary.contains(11)); - assertTrue(startBoundary.contains(50)); - assertFalse(startBoundary.contains(9)); - assertFalse(startBoundary.contains(100)); - assertFalse(startBoundary.contains(150)); - assertFalse(startBoundary.contains(-123)); - assertTrue(startBoundary.contains(new Boundary(11, 99))); - assertTrue(startBoundary.contains(new Boundary(10, 100))); - assertTrue(startBoundary.contains(new Boundary(11, 
11))); - assertFalse(startBoundary.contains(9, 100)); - assertTrue(startBoundary.contains(100, 100)); - assertFalse(startBoundary.contains(100, 101)); - assertFalse(startBoundary.contains(150, 151)); - } - - - @Test - void testIntersects() { - - assertTrue(startBoundary.intersects(new Boundary(1, 11))); - assertTrue(startBoundary.intersects(new Boundary(11, 12))); - assertTrue(startBoundary.intersects(new Boundary(11, 100))); - assertFalse(startBoundary.intersects(new Boundary(100, 101))); - assertTrue(startBoundary.intersects(new Boundary(99, 101))); - } - - - @Test - void testSplit() { - - assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size()); - assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90))); - assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40))); - assertEquals(1, startBoundary.split(Collections.emptyList()).size()); - assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size()); - assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0))); - assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100))); - assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100))); - } - -} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java new file mode 100644 index 0000000..385feb9 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java @@ -0,0 +1,71 @@ +package 
com.knecon.fforesight.service.layoutparser.server.graph; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; + +class TextRangeTest { + + TextRange startTextRange; + + + @BeforeEach + void setUp() { + + startTextRange = new TextRange(10, 100); + } + + + @Test + void testContains() { + + assertTrue(startTextRange.contains(11)); + assertTrue(startTextRange.contains(50)); + assertFalse(startTextRange.contains(9)); + assertFalse(startTextRange.contains(100)); + assertFalse(startTextRange.contains(150)); + assertFalse(startTextRange.contains(-123)); + assertTrue(startTextRange.contains(new TextRange(11, 99))); + assertTrue(startTextRange.contains(new TextRange(10, 100))); + assertTrue(startTextRange.contains(new TextRange(11, 11))); + assertFalse(startTextRange.contains(9, 100)); + assertTrue(startTextRange.contains(100, 100)); + assertFalse(startTextRange.contains(100, 101)); + assertFalse(startTextRange.contains(150, 151)); + } + + + @Test + void testIntersects() { + + assertTrue(startTextRange.intersects(new TextRange(1, 11))); + assertTrue(startTextRange.intersects(new TextRange(11, 12))); + assertTrue(startTextRange.intersects(new TextRange(11, 100))); + assertFalse(startTextRange.intersects(new TextRange(100, 101))); + assertTrue(startTextRange.intersects(new TextRange(99, 101))); + } + + + @Test + void testSplit() { + + assertEquals(4, startTextRange.split(List.of(12, 40, 90)).size()); + assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(List.of(12, 40, 90))); + 
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40))); + assertEquals(1, startTextRange.split(Collections.emptyList()).size()); + assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size()); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0))); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100))); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100))); + } + +} \ No newline at end of file