Merge branch 'feature/RED-9139' into 'main'

RED-9139: move document to its own module

See merge request fforesight/llm-service!26
This commit is contained in:
Kilian Schüttler 2024-11-14 16:29:25 +01:00
commit 2026316694
47 changed files with 69 additions and 5006 deletions

View File

@ -13,10 +13,10 @@ extra["testcontainersVersion"] = "1.20.0"
dependencies {
implementation(project(":llm-service-api"))
implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.181.0")
implementation("com.knecon.fforesight:document:4.425.0-RED9139.13-RED9139.0-RED9139.0")
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.springframework.boot:spring-boot-starter:3.1.1")
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
implementation("com.knecon.fforesight:tenant-commons:0.31.0") {
exclude(group = "com.iqser.red.commons", module = "storage-commons")
}
implementation("com.azure:azure-ai-openai:1.0.0-beta.10")

View File

@ -1,70 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.google.common.base.Functions;
/**
 * Collects ordered, non-overlapping {@link TextRange}s into a list in which
 * directly adjacent ranges (previous end == next start) are merged into one.
 *
 * <p>Input ranges must be sorted by start index and must not overlap; otherwise
 * the accumulator throws {@link IllegalArgumentException}. Only reliable for
 * sequential streams: the combiner concatenates partial lists without merging
 * adjacent ranges at the seam.
 */
public class ConsecutiveBoundaryCollector implements Collector<TextRange, List<TextRange>, List<TextRange>> {

    @Override
    public Supplier<List<TextRange>> supplier() {
        // LinkedList: we only append and read/remove the last element, both O(1).
        return LinkedList::new;
    }

    @Override
    public BiConsumer<List<TextRange>, TextRange> accumulator() {
        return (existingList, boundary) -> {
            if (existingList.isEmpty()) {
                existingList.add(boundary);
                return;
            }
            TextRange prevTextRange = existingList.get(existingList.size() - 1);
            if (prevTextRange.end() > boundary.start()) {
                // Overlapping or out-of-order input violates this collector's contract.
                throw new IllegalArgumentException(String.format("Can't concatenate %s and %s. Boundaries must be ordered!", prevTextRange, boundary));
            }
            if (prevTextRange.end() == boundary.start()) {
                // Directly adjacent: replace the previous range with the merged one.
                existingList.remove(existingList.size() - 1);
                existingList.add(TextRange.merge(List.of(prevTextRange, boundary)));
            } else {
                existingList.add(boundary);
            }
        };
    }

    @Override
    public BinaryOperator<List<TextRange>> combiner() {
        // NOTE(review): partial lists are concatenated without merging adjacent
        // ranges at the seam, so this collector is only correct sequentially.
        return (list1, list2) -> {
            list1.addAll(list2);
            return list1;
        };
    }

    @Override
    public Function<List<TextRange>, List<TextRange>> finisher() {
        // Use the JDK identity function instead of Guava's Functions.identity();
        // java.util.function.Function is already imported and no third-party
        // dependency is needed for this.
        return Function.identity();
    }

    @Override
    public Set<Characteristics> characteristics() {
        // IDENTITY_FINISH is correct here: accumulator and result types match
        // and the finisher really is the identity function.
        return Set.of(Characteristics.IDENTITY_FINISH);
    }
}

View File

@ -1,78 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.textblock.ConcatenatedTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.NoArgsConstructor;
/**
 * Collects {@link TextBlock}s into a list in which blocks with directly
 * consecutive text ranges (previous end == next start) are concatenated into a
 * single {@link ConcatenatedTextBlock}. Empty blocks are dropped.
 *
 * <p>Only reliable for sequential streams: the combiner concatenates partial
 * results without joining consecutive blocks across the seam.
 */
@NoArgsConstructor
public class ConsecutiveTextBlockCollector implements Collector<TextBlock, List<ConcatenatedTextBlock>, List<TextBlock>> {

    @Override
    public Supplier<List<ConcatenatedTextBlock>> supplier() {
        // LinkedList: we only append and inspect the last element.
        return LinkedList::new;
    }

    @Override
    public BiConsumer<List<ConcatenatedTextBlock>, TextBlock> accumulator() {
        return (existingList, textBlock) -> {
            if (textBlock.isEmpty()) {
                // Empty blocks carry no text and are skipped entirely.
                return;
            }
            if (existingList.isEmpty()) {
                startNewBlock(existingList, textBlock);
                return;
            }
            ConcatenatedTextBlock prevBlock = existingList.get(existingList.size() - 1);
            if (prevBlock.getTextRange().end() == textBlock.getTextRange().start()) {
                // Directly consecutive: extend the previous concatenated block.
                prevBlock.concat(textBlock);
            } else {
                startNewBlock(existingList, textBlock);
            }
        };
    }

    // Opens a fresh ConcatenatedTextBlock seeded with the given block.
    private static void startNewBlock(List<ConcatenatedTextBlock> existingList, TextBlock textBlock) {
        ConcatenatedTextBlock ctb = ConcatenatedTextBlock.empty();
        ctb.concat(textBlock);
        existingList.add(ctb);
    }

    @Override
    public BinaryOperator<List<ConcatenatedTextBlock>> combiner() {
        // NOTE(review): partial lists are concatenated without merging at the
        // seam, so parallel collection may leave consecutive blocks unjoined.
        return (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                .toList();
    }

    @Override
    public Function<List<ConcatenatedTextBlock>, List<TextBlock>> finisher() {
        // Widens the element type from ConcatenatedTextBlock to TextBlock.
        return a -> a.stream()
                .map(tb -> (TextBlock) tb)
                .toList();
    }

    @Override
    public Set<Characteristics> characteristics() {
        // Bug fix: IDENTITY_FINISH must NOT be declared here. The finisher is not
        // the identity function (it converts List<ConcatenatedTextBlock> to
        // List<TextBlock>), and the Collector contract allows the stream
        // framework to skip the finisher entirely when IDENTITY_FINISH is set,
        // returning the raw accumulator via an unchecked cast — making the
        // finisher above dead code and leaking the mutable accumulator.
        return Set.of();
    }
}

View File

@ -1,36 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import java.io.Serializable;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
// Serializable container for the raw layout-parser protobuf payloads of one
// parsed document: pages, per-block text data, per-block position data, and the
// document structure tree. Consumed by DocumentGraphMapper, which pairs text
// and position data entries by the same atomic-block index.
public class DocumentData implements Serializable {
DocumentPageProto.AllDocumentPages documentPages;
DocumentTextDataProto.AllDocumentTextData documentTextData;
DocumentPositionDataProto.AllDocumentPositionData documentPositionData;
DocumentStructureWrapper documentStructureWrapper;
// Convenience accessor that unwraps the structure proto from its wrapper.
public DocumentStructureProto.DocumentStructure getDocumentStructure() {
return documentStructureWrapper.getDocumentStructure();
}
}

View File

@ -1,225 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.DuplicatedParagraph;
import com.knecon.fforesight.llm.service.document.nodes.Footer;
import com.knecon.fforesight.llm.service.document.nodes.Header;
import com.knecon.fforesight.llm.service.document.nodes.Headline;
import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.Paragraph;
import com.knecon.fforesight.llm.service.document.nodes.Section;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.SuperSection;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.experimental.UtilityClass;
@UtilityClass
// Builds the in-memory Document graph (pages, tree of semantic nodes, text
// blocks) from the raw protobuf-backed DocumentData produced by the layout
// parser. Stateless; per-conversion state lives in the private Context.
public class DocumentGraphMapper {
// Entry point: converts one DocumentData into a fully wired Document.
public Document toDocumentGraph(DocumentData documentData) {
Document document = new Document();
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
// Materialize one Page per parsed page, in document order (index i holds page i+1).
context.pageData.addAll(documentData.getDocumentPages().getDocumentPagesList()
.stream()
.map(DocumentGraphMapper::buildPage)
.toList());
// Recursively convert the structure proto into tree entries under the root.
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
document.setDocumentTree(context.documentTree);
document.setPages(new HashSet<>(context.pageData));
document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount());
// NOTE(review): self-assignment — presumably getTextBlock() lazily derives the
// block from the document tree and this call materializes/caches it. Confirm
// whether documentTree.buildTextBlock() was intended here instead.
document.setTextBlock(document.getTextBlock());
return document;
}
// Recursively maps EntryData protos to tree entries, building the matching
// SemanticNode for each and wiring page-level links (header/footer/images).
private List<DocumentTree.Entry> buildEntries(List<EntryData> entries, Context context) {
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
for (EntryData entryData : entries) {
List<Page> pages = entryData.getPageNumbersList()
.stream()
.map(context::getPage)
.toList();
// Dispatch on the proto node type; unknown types fail fast.
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SUPER_SECTION -> buildSuperSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
// Leaf entries carry atomic block ids; resolve them into a TextBlock and
// attach the node to its page(s) according to its type.
if (entryData.getAtomicBlockIdsCount() > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = entryData.getTreeIdList();
entryData.getEnginesList()
.forEach(node::addEngine);
node.setTreeId(treeId);
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
} return newEntries;
}
// Node factories: each builds the typed SemanticNode bound to the current tree.
private Headline buildHeadline(Context context) {
return Headline.builder().documentTree(context.documentTree).build();
}
// Images are expected to live on exactly one page (assert below; only active with -ea).
private Image buildImage(Context context, Map<String, String> properties, List<Long> pageNumbers) {
assert pageNumbers.size() == 1;
Page page = context.getPage(pageNumbers.get(0));
var builder = Image.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.documentTree(context.documentTree).page(page).build();
}
private TableCell buildTableCell(Context context, Map<String, String> properties) {
TableCell.TableCellBuilder<?, ?> builder = TableCell.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
private Table buildTable(Context context, Map<String, String> properties) {
Table.TableBuilder builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
private Footer buildFooter(Context context) {
return Footer.builder().documentTree(context.documentTree).build();
}
private Header buildHeader(Context context) {
return Header.builder().documentTree(context.documentTree).build();
}
private Section buildSection(Context context) {
return Section.builder().documentTree(context.documentTree).build();
}
private SuperSection buildSuperSection(Context context) {
return SuperSection.builder().documentTree(context.documentTree).build();
}
// Paragraphs flagged as duplicates additionally carry the unsorted text block
// ids of their duplicate source; see PropertiesMapper.
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
return Paragraph.builder().documentTree(context.documentTree).build();
}
// Resolves atomic block ids into AtomicTextBlocks and folds them into one TextBlock.
private TextBlock toTextBlock(List<Long> atomicTextBlockIds, Context context, SemanticNode parent) {
return atomicTextBlockIds.stream()
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
// Pairs the text datum and position datum stored at the same index; the block's
// page is looked up from the text datum's page number.
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)),
context.documentPositionData.getDocumentPositionData(Math.toIntExact(atomicTextBlockId)),
parent,
context.getPage(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)).getPage()));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
}
// Per-conversion working state shared by the recursive builders.
static final class Context {
private final DocumentTree documentTree;
private final List<Page> pageData;
private final AllDocumentTextData documentTextData;
private final AllDocumentPositionData documentPositionData;
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pageData = new ArrayList<>();
this.documentTextData = documentData.getDocumentTextData();
this.documentPositionData = documentData.getDocumentPositionData();
}
// Page numbers are 1-based in the proto; pageData is 0-based (assert only active with -ea).
private Page getPage(Long pageIndex) {
Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
assert page.getNumber() == Math.toIntExact(pageIndex);
return page;
}
}
}

View File

@ -1,387 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import static java.lang.String.format;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.GenericSemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.NodeType;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
// NOTE(review): @EqualsAndHashCode is redundant here — @Data already generates
// equals/hashCode. Harmless, but one of the two could be dropped.
@EqualsAndHashCode
// Tree of Entry nodes mirroring the semantic structure of a Document. Each
// entry is addressed by a "treeId": the list of child indices on the path from
// the root (empty list = root itself). Children are expected to be ordered by
// their nodes' text ranges (the binary search below relies on this).
public class DocumentTree {
private final Entry root;
// Creates a tree whose root entry wraps the given document.
public DocumentTree(Document document) {
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
}
// Concatenates the leaf text blocks of all entries, in depth-first order.
public TextBlock buildTextBlock() {
return allEntriesInOrder().map(Entry::getNode)
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
}
// Appends a new top-level entry (direct child of the root) and returns its treeId.
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
}
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
}
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
}
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
// Appends a new entry under parentId; the new id is parentId + [childCount].
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
if (!entryExists(parentId)) {
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
}
Entry parent = getEntryById(parentId);
List<Integer> newId = new LinkedList<>(parentId);
newId.add(parent.children.size());
parent.children.add(Entry.builder().treeId(newId).node(node).build());
return newId;
}
// Walks the treeId path, rejecting out-of-range (including negative) indices.
private boolean entryExists(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root != null;
}
Entry entry = root;
for (int id : treeId) {
if (id >= entry.children.size() || 0 > id) {
return false;
}
entry = entry.children.get(id);
}
return true;
}
public Entry getParentEntryById(List<Integer> treeId) {
return getEntryById(getParentId(treeId));
}
// Every non-root entry has a parent; only the root's treeId is empty.
public boolean hasParentById(List<Integer> treeId) {
return !treeId.isEmpty();
}
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
return getEntryById(treeId).children.stream()
.map(Entry::getNode);
}
/**
 * Finds all child nodes of the specified entry, whose nodes textRange intersects the given textRange. It achieves this by finding the first entry, whose textRange contains the start idx of the TextRange using a binary search.
 * It then iterates over the remaining children adding them to the intersections, until one does not contain the end of the TextRange. All intersected Entries are returned as SemanticNodes.
 *
 * @param treeId the treeId of the Entry whose children shall be checked.
 * @param textRange The TextRange to find intersecting childNodes for.
 * @return A list of all SemanticNodes, that are direct children of the specified Entry, whose TextRange intersects the given TextRange
 */
public List<SemanticNode> findIntersectingChildNodes(List<Integer> treeId, TextRange textRange) {
List<Entry> childEntries = getEntryById(treeId).getChildren();
List<SemanticNode> intersectingChildEntries = new LinkedList<>();
int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
if (startIdx < 0) {
// No child contains the start index: no intersections by construction.
return intersectingChildEntries;
}
for (int i = startIdx; i < childEntries.size(); i++) {
if (childEntries.get(i).getNode().getTextRange().start() < textRange.end()) {
intersectingChildEntries.add(childEntries.get(i).getNode());
} else {
// Children are ordered by range, so the first non-intersecting one ends the scan.
break;
}
}
return intersectingChildEntries;
}
// Returns the direct child whose range contains the whole textRange, if any.
public Optional<SemanticNode> findFirstContainingChild(List<Integer> treeId, TextRange textRange) {
List<Entry> childEntries = getEntryById(treeId).getChildren();
int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
if (startIdx < 0) {
return Optional.empty();
}
if (childEntries.get(startIdx).getNode().getTextRange().contains(textRange.end())) {
return Optional.of(childEntries.get(startIdx).getNode());
}
return Optional.empty();
}
// Finds the deepest TableCell under the given table entry containing [start, end].
public Optional<TableCell> findTableCellInTable(List<Integer> treeId, int start, int end) {
return findTableCellInTableRecursively(getEntryById(treeId).getChildren(), start, end);
}
// Depth-first search: prefers a matching cell in the children over the current
// one, so nested (inner) table cells win over their enclosing cell.
private Optional<TableCell> findTableCellInTableRecursively(List<Entry> entries, int start, int end) {
int startIdx = findFirstIdxOfContainingChildBinarySearch(entries, start);
if (startIdx < 0) {
return Optional.empty();
}
Entry entry = entries.get(startIdx);
if (entry.getNode().getTextRange().contains(end) && entry.getNode() instanceof TableCell tableCell) {
if (!entry.getNode().isLeaf()) {
Optional<TableCell> foundInChildren = findTableCellInTableRecursively(entry.getChildren(), start, end);
if (foundInChildren.isPresent()) {
return foundInChildren;
}
}
return Optional.of(tableCell);
}
if (!entry.getNode().isLeaf()) {
Optional<TableCell> foundInChildren = findTableCellInTableRecursively(entry.getChildren(), start, end);
if (foundInChildren.isPresent()) {
return foundInChildren;
}
}
return Optional.empty();
}
// Binary search over range-ordered children: returns the index of the child
// whose [start, end) range contains `start`, or -1 if none does.
private int findFirstIdxOfContainingChildBinarySearch(List<Entry> childNodes, int start) {
int low = 0;
int high = childNodes.size() - 1;
while (low <= high) {
int mid = low + (high - low) / 2;
TextRange range = childNodes.get(mid).getNode().getTextRange();
if (range.start() > start) {
high = mid - 1;
} else if (range.end() <= start) {
low = mid + 1;
} else {
return mid;
}
}
return -1;
}
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
return getEntryById(treeId).children.stream()
.filter(entry -> entry.node.getType().equals(nodeType))
.map(Entry::getNode);
}
// Parent id is the treeId with its last index dropped; empty means root.
private static List<Integer> getParentId(List<Integer> treeId) {
if (treeId.isEmpty()) {
throw new UnsupportedOperationException("Root has no parent!");
}
if (treeId.size() < 2) {
return Collections.emptyList();
}
return treeId.subList(0, treeId.size() - 1);
}
public Optional<SemanticNode> getNextSibling(List<Integer> treeId) {
var siblingTreeId = getNextSiblingId(treeId);
if (!entryExists(siblingTreeId)) {
return Optional.empty();
}
return Optional.of(getEntryById(siblingTreeId).getNode());
}
// Same path as treeId with the last index incremented; may not exist.
public List<Integer> getNextSiblingId(List<Integer> treeId) {
List<Integer> siblingTreeId = new LinkedList<>();
for (int i = 0; i < treeId.size() - 1; i++) {
siblingTreeId.add(treeId.get(i));
}
siblingTreeId.add(treeId.get(treeId.size() - 1) + 1);
return siblingTreeId;
}
public Optional<SemanticNode> getPreviousSibling(List<Integer> treeId) {
var siblingTreeId = getPreviousSiblingId(treeId);
if (!entryExists(siblingTreeId)) {
return Optional.empty();
}
return Optional.of(getEntryById(siblingTreeId).getNode());
}
// Same path as treeId with the last index decremented; may not exist (index -1
// is rejected by entryExists).
public List<Integer> getPreviousSiblingId(List<Integer> treeId) {
List<Integer> siblingTreeId = new LinkedList<>();
for (int i = 0; i < treeId.size() - 1; i++) {
siblingTreeId.add(treeId.get(i));
}
siblingTreeId.add(treeId.get(treeId.size() - 1) - 1);
return siblingTreeId;
}
// Resolves a treeId to its entry; throws IndexOutOfBoundsException for ids
// that do not exist (use findEntryById for a non-throwing lookup).
public Entry getEntryById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root;
}
Entry entry = root;
for (int id : treeId) {
entry = entry.children.get(id);
}
return entry;
}
// Non-throwing lookup of a treeId.
// NOTE(review): unlike entryExists, negative indices are not rejected here and
// would surface as IndexOutOfBoundsException instead of Optional.empty().
public Optional<Entry> findEntryById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return Optional.of(root);
}
Entry entry = root;
for (int id : treeId) {
if (entry.children.size() <= id) {
return Optional.empty();
}
entry = entry.children.get(id);
}
return Optional.of(entry);
}
// Direct children of the root, in order.
public Stream<Entry> mainEntries() {
return root.children.stream();
}
// All entries (including the root) in depth-first pre-order.
public Stream<Entry> allEntriesInOrder() {
return Stream.of(root)
.flatMap(DocumentTree::flatten);
}
// All entries strictly below parentId in depth-first pre-order.
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
return getEntryById(parentId).children.stream()
.flatMap(DocumentTree::flatten);
}
@Override
public String toString() {
return String.join("\n",
allEntriesInOrder().map(Entry::toString)
.toList());
}
// Pre-order flattening: the entry itself, then its flattened children.
private static Stream<Entry> flatten(Entry entry) {
return Stream.concat(Stream.of(entry),
entry.children.stream()
.flatMap(DocumentTree::flatten));
}
// Returns the top-level ancestor (direct child of root) on the treeId path,
// or the root's own node for the empty id.
public SemanticNode getHighestParentById(List<Integer> treeId) {
if (treeId.isEmpty()) {
return root.node;
}
return root.children.get(treeId.get(0)).node;
}
@Builder
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
// A single tree node: its address (treeId), its semantic payload, and its
// ordered children. Fields are final; children is a mutable list that the
// enclosing tree appends to.
public static class Entry {
List<Integer> treeId;
SemanticNode node;
@Builder.Default
List<Entry> children = new ArrayList<>();
@Override
public String toString() {
return node.toString();
}
public NodeType getType() {
return node.getType();
}
}
}

View File

@ -1,72 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.ImageType;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import lombok.experimental.UtilityClass;
@UtilityClass
/**
 * Translates the string property maps of the serialized document structure
 * (see {@link DocumentStructureWrapper}) into typed builder values.
 */
public class PropertiesMapper {

    /** Fills the image builder: type, transparency flag, bounding position and id. */
    public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
        builder.imageType(ImageType.fromString(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
        builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
        builder.position(parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
        builder.id(properties.get(DocumentStructureWrapper.ImageProperties.ID));
    }

    /** Fills the table-cell builder: row/column indices, header flag and bounding box. */
    public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
        builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
        builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
        builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
        builder.bBox(parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
    }

    /** Fills the table builder: row and column counts. */
    public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
        builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
        builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
    }

    // Parses "x<delim>y<delim>w<delim>h" into a rectangle; assumes exactly four
    // delimiter-separated floats (fewer throw IndexOutOfBoundsException).
    private Rectangle2D parseRectangle2D(String bBox) {
        List<Float> floats = Arrays.stream(bBox.split(DocumentStructureWrapper.RECTANGLE_DELIMITER))
            .map(Float::parseFloat)
            .toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

    /** A paragraph is a duplicate iff it carries the unsorted-textblock-id property. */
    public static boolean isDuplicateParagraph(Map<String, String> properties) {
        return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
    }

    public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
        return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
    }

    /**
     * Parses a List#toString-style id list, e.g. "[1, 2, 3]", into longs.
     *
     * @param ids bracketed, comma-separated long values
     * @return the parsed ids, empty for "[]"
     * @throws NumberFormatException if an element is not a valid long
     */
    public static List<Long> toLongList(String ids) {
        String inner = ids.substring(1, ids.length() - 1).trim();
        if (inner.isEmpty()) {
            // Bug fix: "[]" previously reached Long.valueOf("") and threw.
            return List.of();
        }
        // Bug fix: split on the comma plus surrounding whitespace. List.toString()
        // separates entries with ", ", and the old split(",") produced elements
        // like " 2", which Long.valueOf rejects with NumberFormatException.
        return Arrays.stream(inner.split("\\s*,\\s*"))
            .map(Long::valueOf)
            .toList();
    }
}

View File

@ -1,175 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
/**
 * Static helpers for folding collections of rectangles into bounding boxes,
 * optionally splitting horizontally distant rectangles into separate boxes.
 */
public class RectangleTransformations {

    private RectangleTransformations() {
        // All members are static; no instances (standard utility-class guard).
    }

    /** Bounding box over all position rectangles of the given atomic text blocks. */
    public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
        return atomicTextBlocks.stream()
            .flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
                .stream())
            .collect(new Rectangle2DBBoxCollector());
    }

    /** Bounding box over the given rectangles; (0,0,0,0) for an empty collection. */
    public static Rectangle2D rectangle2DBBox(Collection<Rectangle2D> rectangle2DList) {
        return rectangle2DList.stream()
            .collect(new Rectangle2DBBoxCollector());
    }

    /**
     * If two rectangles are further apart than five times the average width of a rectangle, a gap is inserted.
     *
     * @param rectangle2DList A list of rectangles to combine
     * @return A list of rectangles which are combined if they are closer than the split threshold
     */
    public static List<Rectangle2D> rectangleBBoxWithGaps(List<Rectangle2D> rectangle2DList) {
        if (rectangle2DList.isEmpty()) {
            return Collections.emptyList();
        }
        // Split threshold: five times the average rectangle width (5.0 * 5 = 25
        // fallback cannot occur for a non-empty list; orElse(5) kept for safety).
        double splitThreshold = rectangle2DList.stream()
            .mapToDouble(RectangularShape::getWidth).average()
            .orElse(5) * 5.0;
        List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
        List<Rectangle2D> currentGroup = new LinkedList<>();
        rectangleListsWithGaps.add(currentGroup);
        Rectangle2D previousRectangle = rectangle2DList.get(0);
        // Bug fix: seed the first group with the first rectangle and scan from
        // index 1. The old loop started at index 0 and compared the first
        // rectangle against itself (|minX - maxX| = its own width); a rectangle
        // wider than the threshold then spuriously opened a new group, leaving an
        // empty leading group behind that collected to a degenerate 0x0 box.
        currentGroup.add(previousRectangle);
        for (Rectangle2D currentRectangle : rectangle2DList.subList(1, rectangle2DList.size())) {
            if (Math.abs(currentRectangle.getMinX() - previousRectangle.getMaxX()) > splitThreshold) {
                currentGroup = new LinkedList<>();
                rectangleListsWithGaps.add(currentGroup);
            }
            currentGroup.add(currentRectangle);
            previousRectangle = currentRectangle;
        }
        return rectangleListsWithGaps.stream()
            .map(RectangleTransformations::rectangle2DBBox)
            .toList();
    }

    public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
        return new Rectangle2DBBoxCollector();
    }

    // Folds rectangles into the minimal axis-aligned bounding box; UNORDERED
    // since the result does not depend on encounter order.
    private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {

        @Override
        public Supplier<BBox> supplier() {
            return BBox::new;
        }

        @Override
        public BiConsumer<BBox, Rectangle2D> accumulator() {
            return BBox::addRectangle;
        }

        @Override
        public BinaryOperator<BBox> combiner() {
            // NOTE(review): only used for parallel streams; an entirely empty
            // partition would leave null fields and NPE in Math.min — confirm
            // this collector is never used on parallel streams with empty splits.
            return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
                Math.min(b1.lowerLeftY, b2.lowerLeftY),
                Math.max(b1.upperRightX, b2.upperRightX),
                Math.max(b1.upperRightY, b2.upperRightY));
        }

        @Override
        public Function<BBox, Rectangle2D> finisher() {
            return BBox::toRectangle2D;
        }

        @Override
        public Set<Characteristics> characteristics() {
            return Set.of(Characteristics.UNORDERED);
        }

        // Mutable accumulator; null fields mean "no rectangle seen yet".
        @AllArgsConstructor
        @NoArgsConstructor
        private static class BBox {
            Double lowerLeftX;
            Double lowerLeftY;
            Double upperRightX;
            Double upperRightY;

            // Empty accumulator collapses to the degenerate (0,0,0,0) rectangle.
            public Rectangle2D toRectangle2D() {
                if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) {
                    return new Rectangle2D.Double(0, 0, 0, 0);
                }
                return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
            }

            // Grows the box to include the given rectangle (min/max normalization
            // guards against rectangles with inverted coordinates).
            public void addRectangle(Rectangle2D rectangle2D) {
                double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
                double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
                double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
                double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
                if (this.lowerLeftX == null) {
                    this.lowerLeftX = lowerLeftX;
                } else if (this.lowerLeftX > lowerLeftX) {
                    this.lowerLeftX = lowerLeftX;
                }
                if (this.lowerLeftY == null) {
                    this.lowerLeftY = lowerLeftY;
                } else if (this.lowerLeftY > lowerLeftY) {
                    this.lowerLeftY = lowerLeftY;
                }
                if (this.upperRightX == null) {
                    this.upperRightX = upperRightX;
                } else if (this.upperRightX < upperRightX) {
                    this.upperRightX = upperRightX;
                }
                if (this.upperRightY == null) {
                    this.upperRightY = upperRightY;
                } else if (this.upperRightY < upperRightY) {
                    this.upperRightY = upperRightY;
                }
            }
        }
    }
}

View File

@ -1,250 +0,0 @@
package com.knecon.fforesight.llm.service.document;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * Represents a range of text defined by a start (inclusive) and end (exclusive) index.
 * Provides functionality to check containment, intersection, and to adjust ranges based on specified conditions.
 *
 * <p>NOTE(review): the Lombok-generated setters can break the {@code start <= end} invariant that the
 * constructor enforces — confirm whether mutation after construction is actually required.
 */
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class TextRange implements Comparable<TextRange> {

    private int start;
    private int end;

    /**
     * Constructs a TextRange with specified start and end indexes.
     *
     * @param start The starting index of the range (inclusive).
     * @param end   The ending index of the range (exclusive).
     * @throws IllegalArgumentException If start is greater than end.
     */
    public TextRange(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }

    /**
     * Returns the length of the text range.
     *
     * @return The length of the range.
     */
    public int length() {
        return end - start;
    }

    public int start() {
        return start;
    }

    public int end() {
        return end;
    }

    /**
     * Checks if this {@link TextRange} fully contains another TextRange.
     *
     * @param textRange The {@link TextRange} to check.
     * @return true if this range contains the specified range, false otherwise.
     */
    public boolean contains(TextRange textRange) {
        return start <= textRange.start() && textRange.end() <= end;
    }

    /**
     * Checks if this {@link TextRange} is fully contained by another TextRange.
     *
     * @param textRange The {@link TextRange} to check against.
     * @return true if this range is contained by the specified range, false otherwise.
     */
    public boolean containedBy(TextRange textRange) {
        return textRange.contains(this);
    }

    /**
     * Checks if this {@link TextRange} contains another range specified by start and end indices.
     *
     * @param start The starting index of the range to check.
     * @param end   The ending index of the range to check.
     * @return true if this range fully contains the specified range, false otherwise.
     * @throws IllegalArgumentException If the start index is greater than the end index.
     */
    public boolean contains(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return this.start <= start && end <= this.end;
    }

    /**
     * Checks if this {@link TextRange} is fully contained within another range specified by start and end indices.
     *
     * @param start The starting index of the outer range.
     * @param end   The ending index of the outer range.
     * @return true if this range is fully contained within the specified range, false otherwise.
     * @throws IllegalArgumentException If the start index is greater than the end index.
     */
    public boolean containedBy(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return start <= this.start && this.end <= end;
    }

    /**
     * Determines if the specified index is within this {@link TextRange}.
     *
     * @param index The index to check.
     * @return true if the index is within the range (inclusive of the start and exclusive of the end), false otherwise.
     */
    public boolean contains(int index) {
        return start <= index && index < end;
    }

    /**
     * Checks if this {@link TextRange} intersects with another {@link TextRange}.
     *
     * @param textRange The {@link TextRange} to check for intersection.
     * @return true if the ranges intersect, false otherwise.
     */
    public boolean intersects(TextRange textRange) {
        return textRange.start() < this.end && this.start < textRange.end();
    }

    /**
     * Splits this TextRange into multiple ranges based on a list of indices.
     *
     * @param splitIndices The indices at which to split the range.
     * @return A list of TextRanges resulting from the split.
     * @throws IndexOutOfBoundsException If any split index is outside this TextRange.
     */
    public List<TextRange> split(List<Integer> splitIndices) {
        if (splitIndices.stream()
                .anyMatch(idx -> !this.contains(idx))) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
                    splitIndices.stream()
                            .filter(idx -> !this.contains(idx))
                            .toList(),
                    this));
        }
        List<TextRange> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) {
            int splitIndex = splitIndices.get(i);
            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new TextRange(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new TextRange(previousIndex, end));
        return splitBoundaries;
    }

    /**
     * Merges a collection of TextRanges into a single TextRange encompassing all.
     *
     * @param boundaries The collection of TextRanges to merge.
     * @return A new TextRange covering the entire span of the given ranges.
     * @throws IllegalArgumentException If boundaries are empty.
     */
    public static TextRange merge(Collection<TextRange> boundaries) {
        // Fixed: single pass instead of two stream traversals, and a diagnostic message
        // instead of a bare IllegalArgumentException on empty input.
        if (boundaries.isEmpty()) {
            throw new IllegalArgumentException("Cannot merge an empty collection of TextRanges");
        }
        int minStart = Integer.MAX_VALUE;
        int maxEnd = Integer.MIN_VALUE;
        for (TextRange boundary : boundaries) {
            minStart = Math.min(minStart, boundary.start());
            maxEnd = Math.max(maxEnd, boundary.end());
        }
        return new TextRange(minStart, maxEnd);
    }

    @Override
    public String toString() {
        return format("Boundary [%d|%d)", start, end);
    }

    /**
     * Orders ranges that lie strictly before/after each other.
     *
     * <p>NOTE(review): this comparison returns 0 for partially overlapping, non-equal ranges; it is
     * neither transitive nor consistent with {@link #equals(Object)}. Do not use it as the ordering of a
     * {@code TreeSet}/{@code TreeMap}. Behavior kept as-is to preserve existing sort results.
     */
    @Override
    public int compareTo(TextRange textRange) {
        if (end < textRange.end() && start < textRange.start()) {
            return -1;
        }
        if (start > textRange.start() && end > textRange.end()) {
            return 1;
        }
        return 0;
    }

    /**
     * Shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without trailing or preceding whitespaces.
     *
     * @param textBlock TextBlock to check whitespaces against
     * @return Trimmed boundary
     */
    public TextRange trim(TextBlock textBlock) {
        if (this.length() == 0) {
            return this;
        }
        int trimmedStart = this.start;
        while (textBlock.containsIndex(trimmedStart) && trimmedStart < end && Character.isWhitespace(textBlock.charAt(trimmedStart))) {
            trimmedStart++;
        }
        int trimmedEnd = this.end;
        while (textBlock.containsIndex(trimmedEnd - 1) && trimmedStart < trimmedEnd && Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
            trimmedEnd--;
        }
        // trimmedEnd can never undershoot trimmedStart due to the loop guards; max() is a safety net.
        return new TextRange(trimmedStart, Math.max(trimmedEnd, trimmedStart));
    }
}

View File

@ -1,10 +0,0 @@
package com.knecon.fforesight.llm.service.document.entity;
/**
 * Classifies a {@link TextEntity} (see its {@code entityType} field).
 *
 * <p>NOTE(review): constant semantics are inferred from their names only — confirm against the
 * producers and consumers of these values before relying on this description.
 */
public enum EntityType {
    ENTITY,
    HINT,
    RECOMMENDATION,
    FALSE_POSITIVE,
    FALSE_RECOMMENDATION,
    DICTIONARY_REMOVAL
}

View File

@ -1,30 +0,0 @@
package com.knecon.fforesight.llm.service.document.entity;
import com.knecon.fforesight.llm.service.document.TextRange;
/**
 * Contract for document entities that carry a string value, occupy a {@link TextRange} and expose a type.
 * Implemented e.g. by {@code TextEntity} and the {@code Image} node.
 */
public interface IEntity {

    /**
     * Gets the value of this entity as a string.
     *
     * @return The string value.
     */
    String getValue();

    /**
     * Gets the range of text in the document associated with this entity.
     *
     * @return The text range.
     */
    TextRange getTextRange();

    /**
     * Gets the type of this entity.
     *
     * @return The entity type as a free-form string (not to be confused with {@code EntityType}).
     */
    String type();
}

View File

@ -1,46 +0,0 @@
package com.knecon.fforesight.llm.service.document.entity;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import lombok.experimental.UtilityClass;
/**
 * Builds deterministic ids for entities from their page numbers, per-line rectangles and type
 * information by hashing a canonical key string with murmur3_128.
 */
@UtilityClass
public final class IdBuilder {

    private final HashFunction hashFunction = Hashing.murmur3_128();

    /**
     * Convenience overload: extracts the page numbers from the given pages and delegates.
     */
    public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
        List<Integer> pageNumbers = pages.stream()
                .map(Page::getNumber)
                .collect(Collectors.toList());
        return buildId(pageNumbers, rectanglesPerLine, type, entityType);
    }

    /**
     * Builds the id from a canonical key: type + entityType, the page numbers in ascending order,
     * then the rounded x/y/width/height of every line rectangle.
     */
    public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
        StringBuilder key = new StringBuilder().append(type).append(entityType);
        pageNumbers.stream()
                .sorted()
                .forEach(key::append);
        for (Rectangle2D rectangle : rectanglesPerLine) {
            key.append(Math.round(rectangle.getX()))
                    .append(Math.round(rectangle.getY()))
                    .append(Math.round(rectangle.getWidth()))
                    .append(Math.round(rectangle.getHeight()));
        }
        return hashFunction.hashString(key.toString(), StandardCharsets.UTF_8).toString();
    }
}

View File

@ -1,25 +0,0 @@
package com.knecon.fforesight.llm.service.document.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Position of (part of) an entity on a single page, expressed as one rectangle per text line.
 */
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PositionOnPage {

    // Each entry in this list corresponds to an entry in the redaction log, this means:
    // A single entity might be represented by multiple redaction log entries
    // This is due to the RedactionLog only being able to handle a single page per entry.
    final String id;
    Page page;
    List<Rectangle2D> rectanglePerLine;
}

View File

@ -1,248 +0,0 @@
package com.knecon.fforesight.llm.service.document.entity;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * An entity detected in the document text: an id, a primary text range (plus duplicate ranges for
 * repeated occurrences), type information, and — once inserted into the document graph — its value,
 * surrounding text, pages and intersecting semantic nodes.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class TextEntity implements IEntity {

    // primary key
    @EqualsAndHashCode.Include
    final String id;
    // primary key end

    // Primary occurrence of the entity in the document text.
    TextRange textRange;
    // Additional occurrences of the same entity elsewhere in the text.
    @Builder.Default
    List<TextRange> duplicateTextRanges = new ArrayList<>();

    String type; // TODO: make final once ManualChangesApplicationService::recategorize is deleted
    final EntityType entityType;

    // inferred on graph insertion
    String value;
    String textBefore;
    String textAfter;
    @Builder.Default
    Set<Page> pages = new HashSet<>();
    // Lazily computed; see getPositionsOnPagePerPage().
    List<PositionOnPage> positionsOnPagePerPage;
    @Builder.Default
    List<SemanticNode> intersectingNodes = new LinkedList<>();
    SemanticNode deepestFullyContainingNode;

    /**
     * Creates a minimal entity whose id is derived from its position rectangles within the given node.
     */
    public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, SemanticNode node) {
        return TextEntity.builder().id(buildId(node, textRange, type, entityType)).type(type).entityType(entityType).textRange(textRange).build();
    }

    /**
     * Creates a minimal entity with an explicitly supplied id.
     */
    public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id) {
        return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build();
    }

    /**
     * Creates a minimal entity with an explicitly supplied id.
     * NOTE(review): {@code manualOverwriteSection} is currently ignored — confirm whether this overload
     * is still needed or the parameter should be wired through.
     */
    public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id, String manualOverwriteSection) {
        return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build();
    }

    // Derives the id from the entity's rectangles on every page it touches within the given node.
    private static String buildId(SemanticNode node, TextRange textRange, String type, EntityType entityType) {
        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = node.getPositionsPerPage(textRange);
        return IdBuilder.buildId(rectanglesPerLinePerPage.keySet(),
                rectanglesPerLinePerPage.values()
                        .stream()
                        .flatMap(Collection::stream)
                        .toList(),
                type,
                entityType.name());
    }

    /** Registers an additional occurrence of this entity. */
    public void addTextRange(TextRange textRange) {
        duplicateTextRanges.add(textRange);
    }

    /** @return true if any intersecting node is an instance of the given node class. */
    public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
        return intersectingNodes.stream()
                .anyMatch(clazz::isInstance);
    }

    /** @return true if the given node is among the intersecting nodes. */
    public boolean occursInNode(SemanticNode semanticNode) {
        return intersectingNodes.stream()
                .anyMatch(node -> node.equals(semanticNode));
    }

    public boolean isType(String type) {
        return type().equals(type);
    }

    public boolean isAnyType(List<String> types) {
        return types.contains(type());
    }

    public void addIntersectingNode(SemanticNode containingNode) {
        intersectingNodes.add(containingNode);
    }

    /** @return the entity text including line breaks, read from the deepest fully containing node. */
    public String getValueWithLineBreaks() {
        return getDeepestFullyContainingNode().getTextBlock().subSequenceWithLineBreaks(getTextRange());
    }

    /**
     * Detaches this entity from the document graph: removes it from all nodes and pages that
     * reference it and clears the back-references held here.
     */
    public void removeFromGraph() {
        intersectingNodes.forEach(node -> node.getEntities().remove(this));
        pages.forEach(page -> page.getEntities().remove(this));
        intersectingNodes = new LinkedList<>();
        deepestFullyContainingNode = null;
        pages = new HashSet<>();
    }

    /**
     * Lazily computes the per-page positions from the deepest fully containing node's text block.
     * The entry on the first (lowest-numbered) page keeps the plain id; entries on later pages get
     * the page number appended (see buildPositionOnPage).
     */
    public List<PositionOnPage> getPositionsOnPagePerPage() {
        if (positionsOnPagePerPage == null || positionsOnPagePerPage.isEmpty()) {
            Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
            Page firstPage = rectanglesPerLinePerPage.keySet()
                    .stream()
                    .min(Comparator.comparingInt(Page::getNumber))
                    .orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
            positionsOnPagePerPage = rectanglesPerLinePerPage.entrySet()
                    .stream()
                    .map(entry -> buildPositionOnPage(firstPage, id, entry))
                    .toList();
        }
        return positionsOnPagePerPage;
    }

    // First page keeps the entity id as-is; subsequent pages are disambiguated with "-<pageNumber>".
    private static PositionOnPage buildPositionOnPage(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
        if (entry.getKey().equals(firstPage)) {
            return new PositionOnPage(id, entry.getKey(), entry.getValue());
        } else {
            return new PositionOnPage(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
        }
    }

    public boolean containedBy(TextEntity textEntity) {
        return textEntity.contains(this);
    }

    /**
     * Checks containment across the primary range and all duplicate ranges of both entities:
     * true if any of this entity's ranges contains any of the other entity's ranges.
     * NOTE(review): the other entity's duplicate ranges are only checked against this entity's
     * duplicates, not against its primary range — confirm whether that asymmetry is intended.
     */
    public boolean contains(TextEntity textEntity) {
        if (this.textRange.contains(textEntity.getTextRange())) {
            return true;
        }
        List<TextRange> textEntityDuplicateRanges = textEntity.getDuplicateTextRanges();
        // use optimized indexed loops for extra performance boost
        for (int i = 0, duplicateTextRangesSize = duplicateTextRanges.size(); i < duplicateTextRangesSize; i++) {
            TextRange duplicateTextRange = duplicateTextRanges.get(i);
            if (duplicateTextRange.contains(textEntity.getTextRange())) {
                return true;
            }
            for (int j = 0, textEntityDuplicateRangesSize = textEntityDuplicateRanges.size(); j < textEntityDuplicateRangesSize; j++) {
                TextRange otherRange = textEntityDuplicateRanges.get(j);
                if (duplicateTextRange.contains(otherRange)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Checks intersection across the primary range and all duplicate ranges of both entities.
     */
    public boolean intersects(TextEntity textEntity) {
        return this.textRange.intersects(textEntity.getTextRange()) //
                || duplicateTextRanges.stream()
                .anyMatch(duplicateTextRange -> duplicateTextRange.intersects(textEntity.textRange)) //
                || duplicateTextRanges.stream()
                .anyMatch(duplicateTextRange -> textEntity.getDuplicateTextRanges()
                        .stream()
                        .anyMatch(duplicateTextRange::intersects));
    }

    /** @return true if any per-page position id matches the given manual redaction/annotation id. */
    public boolean matchesAnnotationId(String manualRedactionId) {
        return getPositionsOnPagePerPage().stream()
                .anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
    }

    /**
     * Human-readable summary: value, range, pages and type information.
     * NOTE(review): assumes at least one page — the trailing-delimiter delete would corrupt the
     * output for an entity with an empty page set.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Entity[\"");
        sb.append(value);
        sb.append("\", ");
        sb.append(textRange);
        sb.append(", pages[");
        pages.forEach(page -> {
            sb.append(page.getNumber());
            sb.append(", ");
        });
        sb.delete(sb.length() - 2, sb.length());
        sb.append("], type = \"");
        sb.append(type());
        sb.append("\", EntityType.");
        sb.append(entityType);
        sb.append("]");
        return sb.toString();
    }

    @Override
    public String type() {
        return getType();
    }
}

View File

@ -1,73 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Base implementation for all semantic nodes of the document tree: carries the tree position,
 * lazily resolved text block and bounding box, detected entities and layout-engine provenance.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public abstract class AbstractSemanticNode implements GenericSemanticNode {

    // Layout engines that contributed to this node; defaults to the algorithmic engine.
    @Builder.Default
    Set<LayoutEngineProto.LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM));

    // Nodes are identified by their position in the document tree alone.
    @EqualsAndHashCode.Include
    List<Integer> treeId;

    // Lazy cache; resolved via the interface default on first access (not thread-safe — TODO confirm single-threaded use).
    TextBlock textBlock;
    DocumentTree documentTree;

    @Builder.Default
    Set<TextEntity> entities = new HashSet<>();

    // Lazy cache for getBBox(); same caveats as textBlock.
    Map<Page, Rectangle2D> bBoxCache;

    /**
     * Returns the node's text block, computing and caching the interface default on first access.
     */
    @Override
    public TextBlock getTextBlock() {
        if (textBlock == null) {
            textBlock = GenericSemanticNode.super.getTextBlock();
        }
        return textBlock;
    }

    @Override
    public String toString() {
        return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary();
    }

    /**
     * Returns the bounding box per page, computing and caching the interface default on first access.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {
        if (bBoxCache == null) {
            bBoxCache = GenericSemanticNode.super.getBBox();
        }
        return bBoxCache;
    }
}

View File

@ -1,171 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents the entire document as the root node within the document's semantic structure.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Document extends AbstractSemanticNode {

    Set<Page> pages;
    Integer numberOfPages;

    // Shared identifier for the document root.
    // Fixed: removed @Builder.Default — it has no effect on a static field (Lombok ignores it with a warning).
    static final SectionIdentifier sectionIdentifier = SectionIdentifier.document();

    @Override
    public NodeType getType() {
        return NodeType.DOCUMENT;
    }

    /**
     * Gets the sections of the document as a list.
     *
     * @return A list of all sections within the document.
     */
    public List<Section> getAllSections() {
        return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
                .collect(Collectors.toList());
    }

    /**
     * Gets the main sections of the document as a list.
     *
     * @return A list of main sections within the document
     * @deprecated This method is marked for removal.
     * Use {@link #streamChildrenOfType(NodeType)} instead,
     * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
     */
    @Deprecated(forRemoval = true)
    public List<Section> getMainSections() {
        return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
                .collect(Collectors.toList());
    }

    /**
     * Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
     *
     * @return A list of all children of type SECTION or SUPER_SECTION.
     */
    public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
        return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
                .toList();
    }

    /**
     * Streams all terminal (leaf) text blocks within the document in their natural order.
     *
     * @return A stream of terminal {@link TextBlock}.
     */
    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
        return streamAllNodes().filter(SemanticNode::isLeaf)
                .map(SemanticNode::getTextBlock);
    }

    /** The document is the root node and therefore has an empty tree id. */
    @Override
    public List<Integer> getTreeId() {
        return Collections.emptyList();
    }

    @Override
    public void setTreeId(List<Integer> tocId) {
        // Fixed garbled message (was "root of the TablePageBlock of Contents").
        throw new UnsupportedOperationException("Document is always the root of the Table of Contents");
    }

    @Override
    public SectionIdentifier getSectionIdentifier() {
        return sectionIdentifier;
    }

    /** @return the first headline found anywhere in the document, or an empty headline. */
    @Override
    public Headline getHeadline() {
        return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
                .findFirst()
                .orElseGet(Headline::empty);
    }

    /**
     * Streams all nodes within the document, regardless of type, in their natural order.
     *
     * @return A stream of all {@link SemanticNode} within the document.
     */
    private Stream<SemanticNode> streamAllNodes() {
        return getDocumentTree().allEntriesInOrder()
                .map(DocumentTree.Entry::getNode);
    }

    /**
     * Streams all image nodes contained within the document.
     *
     * @return A stream of {@link Image} nodes.
     */
    public Stream<Image> streamAllImages() {
        return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
    }

    @Override
    public String toString() {
        return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
    }

    /** @return one full-page rectangle per page of the document. */
    @Override
    public Map<Page, Rectangle2D> getBBox() {
        Map<Page, Rectangle2D> bBox = new HashMap<>();
        for (Page page : pages) {
            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
        }
        return bBox;
    }
}

View File

@ -1,35 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * Paragraph variant that additionally carries an unsorted copy of its leaf text block; the node's
 * text block is the combination of both.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {

    // NOTE(review): presumably the same text in unsorted extraction order — confirm against the builder of these nodes.
    TextBlock unsortedLeafTextBlock;

    /** Combines the inherited (sorted) leaf block with the unsorted one. */
    @Override
    public TextBlock getTextBlock() {
        return Stream.of(leafTextBlock, unsortedLeafTextBlock)
                .collect(new TextBlockCollector());
    }

    /**
     * Deliberately delegates to {@link Paragraph#toString()}. Do not remove: declaring the override
     * stops Lombok's {@code @Data} from generating a new toString for this class.
     */
    @Override
    public String toString() {
        return super.toString();
    }
}

View File

@ -1,62 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node for the footer area of a page.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Footer extends AbstractSemanticNode {

    static final SectionIdentifier sectionIdentifier = SectionIdentifier.empty();

    TextBlock leafTextBlock;

    /** A footer never has children. */
    @Override
    public boolean isLeaf() {
        return true;
    }

    @Override
    public NodeType getType() {
        return NodeType.FOOTER;
    }

    @Override
    public TextBlock getTextBlock() {
        return leafTextBlock;
    }

    /** Footers carry no section numbering. */
    @Override
    public SectionIdentifier getSectionIdentifier() {
        return sectionIdentifier;
    }

    @Override
    public String toString() {
        return String.format("%s: %s: %s", getTreeId(), NodeType.FOOTER, leafTextBlock.buildSummary());
    }
}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
/**
 * Marker interface between {@link SemanticNode} and its implementations; adds no members of its own.
 * {@link AbstractSemanticNode} uses it to invoke the inherited default implementations via
 * {@code GenericSemanticNode.super}.
 */
public interface GenericSemanticNode extends SemanticNode {

}

View File

@ -1,65 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents the header part of a document page.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Header extends AbstractSemanticNode {

    static final SectionIdentifier sectionIdentifier = SectionIdentifier.empty();

    TextBlock leafTextBlock;

    @Override
    public NodeType getType() {
        return NodeType.HEADER;
    }

    /** A header never has children. */
    @Override
    public boolean isLeaf() {
        return true;
    }

    @Override
    public TextBlock getTextBlock() {
        return leafTextBlock;
    }

    /** Headers carry no section numbering. */
    @Override
    public SectionIdentifier getSectionIdentifier() {
        return sectionIdentifier;
    }

    @Override
    public String toString() {
        return String.format("%s: %s: %s", getTreeId(), NodeType.HEADER, leafTextBlock.buildSummary());
    }
}

View File

@ -1,100 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents a headline in a document.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Headline extends AbstractSemanticNode {

    TextBlock leafTextBlock;
    // Lazily parsed from the headline's search text; see getSectionIdentifier().
    SectionIdentifier sectionIdentifier;

    @Override
    public NodeType getType() {
        return NodeType.HEADLINE;
    }

    /** A headline never has children. */
    @Override
    public boolean isLeaf() {
        return true;
    }

    @Override
    public TextBlock getTextBlock() {
        return leafTextBlock;
    }

    @Override
    public String toString() {
        return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
    }

    /** A headline is its own headline. */
    @Override
    public Headline getHeadline() {
        return this;
    }

    /**
     * Parses and caches the section identifier from the headline text on first access.
     * NOTE(review): the lazy init is not thread-safe — assumed single-threaded use, confirm.
     */
    @Override
    public SectionIdentifier getSectionIdentifier() {
        if (sectionIdentifier == null) {
            sectionIdentifier = SectionIdentifier.fromSearchText(getTextBlock().getSearchText());
        }
        return sectionIdentifier;
    }

    /**
     * Creates an empty headline with no text content.
     *
     * @return An empty {@link Headline} instance.
     */
    public static Headline empty() {
        // Placeholder arguments (-1 ids, fresh Page) — see AtomicTextBlock.empty for their meaning.
        return Headline.builder().leafTextBlock(AtomicTextBlock.empty(-1L, 0, new Page(), -1, null)).build();
    }

    /**
     * Checks if this headline is associated with any paragraphs within its parent section or node.
     *
     * @return True if there are paragraphs associated with this headline, false otherwise.
     */
    public boolean hasParagraphs() {
        // findAny() instead of findFirst(): only existence matters here (consistent with Section.hasTables).
        return getParent().streamAllSubNodesOfType(NodeType.PARAGRAPH)
                .findAny()
                .isPresent();
    }
}

View File

@ -1,140 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.entity.IEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents an image within the document.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Image extends AbstractSemanticNode implements IEntity {

    String id;
    TextBlock leafTextBlock;
    ImageType imageType;
    boolean transparent;
    Rectangle2D position;
    Page page;

    @Override
    public NodeType getType() {
        return NodeType.IMAGE;
    }

    @Override
    public TextBlock getTextBlock() {
        return leafTextBlock;
    }

    /** An image lives on exactly one page. */
    @Override
    public Set<Page> getPages() {
        return Collections.singleton(page);
    }

    @Override
    public TextRange getTextRange() {
        return leafTextBlock.getTextRange();
    }

    @Override
    public int length() {
        return getTextRange().length();
    }

    /** @return the node type in lower case, e.g. "image". */
    @Override
    public String type() {
        return getType().toString().toLowerCase(Locale.ENGLISH);
    }

    @Override
    public String toString() {
        return getTreeId() + ": " + getValue() + " " + position;
    }

    @Override
    public Map<Page, Rectangle2D> getBBox() {
        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        bBoxPerPage.put(page, position);
        return bBoxPerPage;
    }

    /** @return node type plus capitalized image type, e.g. "Image:Logo". */
    @Override
    public String getValue() {
        return NodeType.IMAGE + ":" + capitalize(imageType.toString());
    }

    // Keeps the first character, lower-cases the rest (renamed from the misleading "camelCase").
    private String capitalize(String name) {
        return name.charAt(0) + name.substring(1).toLowerCase(Locale.ENGLISH);
    }

    /**
     * Checks whether the overlap with the given image exceeds the threshold as a fraction of this
     * image's own area.
     *
     * @param image                the potential container
     * @param containmentThreshold minimal overlap fraction (exclusive)
     * @return true if the overlap fraction exceeds the threshold; false if the images are on different pages
     */
    public boolean mostlyContainedBy(Image image, double containmentThreshold) {
        Map<Page, Rectangle2D> bboxImage = image.getBBox();
        Map<Page, Rectangle2D> bbox = this.getBBox();
        //image needs to be on the same page
        if (bboxImage.get(this.page) != null) {
            Rectangle2D intersection = bboxImage.get(this.page).createIntersection(bbox.get(this.page));
            double calculatedIntersection = intersection.getWidth() * intersection.getHeight();
            double area = bbox.get(this.page).getWidth() * bbox.get(this.page).getHeight();
            return (calculatedIntersection / area) > containmentThreshold;
        }
        return false;
    }

    /**
     * Counterpart of {@link #mostlyContainedBy(Image, double)}.
     *
     * <p>Fixed: the same-page guard present in {@code mostlyContainedBy} was missing here, so an image
     * on a different page caused a {@link NullPointerException}; such images now return false.
     *
     * <p>NOTE(review): the ratio {@code area / intersection} grows as the overlap shrinks, which looks
     * inverted for a "mostly contains" check (compare {@code mostlyContainedBy}) — confirm the intended
     * semantics with callers before changing it.
     */
    public boolean mostlyContains(Image image, double containmentThreshold) {
        Map<Page, Rectangle2D> bboxImage = image.getBBox();
        Map<Page, Rectangle2D> bbox = this.getBBox();
        // image needs to be on the same page
        if (bboxImage.get(this.page) == null) {
            return false;
        }
        Rectangle2D intersection = bbox.get(this.page).createIntersection(bboxImage.get(this.page));
        double calculatedIntersection = intersection.getWidth() * intersection.getHeight();
        double area = bbox.get(this.page).getWidth() * bbox.get(this.page).getHeight();
        return (area / calculatedIntersection) > containmentThreshold;
    }
}

View File

@ -1,25 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.Locale;
/**
 * The kinds of images the layout pipeline distinguishes.
 */
public enum ImageType {

    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OTHER_UNUSED_PLACEHOLDER_DO_NOT_USE, // placeholder removed below — see note
    OCR,
    GRAPHIC;

    /**
     * Parses an image type from its case-insensitive name; unknown values fall back to {@link #OTHER}.
     *
     * @throws NullPointerException if {@code imageType} is null (matches previous behavior)
     */
    public static ImageType fromString(String imageType) {
        String normalized = imageType.toLowerCase(Locale.ROOT);
        for (ImageType candidate : values()) {
            if (candidate.name().toLowerCase(Locale.ROOT).equals(normalized)) {
                return candidate;
            }
        }
        return OTHER;
    }
}

View File

@ -1,22 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.Locale;
/**
 * The node kinds that make up the document's semantic tree.
 */
public enum NodeType {

    DOCUMENT,
    SECTION,
    SUPER_SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;

    /**
     * Returns the enum name with only its first letter capitalized,
     * e.g. {@code SUPER_SECTION -> "Super_section"}.
     */
    @Override // Fixed: @Override annotation was missing.
    public String toString() {
        return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ENGLISH);
    }
}

View File

@ -1,94 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * Represents a single page in a document.
 */
@Getter
@Setter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Page {

    // Pages are identified by their number alone.
    @EqualsAndHashCode.Include
    Integer number;
    Integer height;
    Integer width;
    Integer rotation;
    // Leaf text blocks laid out on this page (assumed reading order — TODO confirm ordering guarantee).
    List<AtomicTextBlock> textBlocksOnPage;
    Header header;
    Footer footer;
    @Builder.Default
    Set<TextEntity> entities = new HashSet<>();
    @Builder.Default
    Set<Image> images = new HashSet<>();

    /**
     * Constructs and returns a {@link TextBlock} representing the concatenated text of all non-empty
     * leaf text blocks in the main body.
     *
     * @return The main body text block.
     */
    public TextBlock getMainBodyTextBlock() {
        return textBlocksOnPage.stream()
                .filter(atb -> !atb.isEmpty())
                .collect(new TextBlockCollector());
    }

    /**
     * Retrieves the highest SemanticNodes, which appear only on this page. It is achieved by traversing
     * the DocumentTree up, until a SemanticNode's direct parent is no longer exclusively on this page.
     *
     * @return A list which contains the highest SemanticNodes, which appear only on this page.
     */
    public List<SemanticNode> getMainBody() {
        return textBlocksOnPage.stream()
                .map(AtomicTextBlock::getParent)
                .map(this::getHighestParentOnlyOnPage)
                .distinct()
                .toList();
    }

    // Walks up the tree while the parent is still confined to this page.
    private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {
        SemanticNode currentNode = node;
        while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
            currentNode = currentNode.getParent();
        }
        return currentNode;
    }

    @Override
    public String toString() {
        return String.valueOf(number);
    }
}

View File

@ -1,54 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Leaf node holding a single paragraph of text.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PROTECTED)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Paragraph extends AbstractSemanticNode {

    TextBlock leafTextBlock;

    /** A paragraph never has children. */
    @Override
    public boolean isLeaf() {
        return true;
    }

    @Override
    public NodeType getType() {
        return NodeType.PARAGRAPH;
    }

    @Override
    public TextBlock getTextBlock() {
        return leafTextBlock;
    }

    @Override
    public String toString() {
        return String.format("%s: %s: %s", getTreeId(), NodeType.PARAGRAPH, leafTextBlock.buildSummary());
    }
}

View File

@ -1,90 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Represents a section within a document, encapsulating both its textual content and semantic structure.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Section extends AbstractSemanticNode {

    /** Always {@link NodeType#SECTION}. */
    @Override
    public NodeType getType() {
        return NodeType.SECTION;
    }

    /**
     * Checks if this section contains any tables.
     *
     * @return true if at least one table node exists anywhere below this section
     */
    public boolean hasTables() {
        return streamAllSubNodesOfType(NodeType.TABLE).anyMatch(table -> true);
    }

    /** A section's identifier is the identifier of its own headline (no child marker). */
    @Override
    public SectionIdentifier getSectionIdentifier() {
        return getHeadline().getSectionIdentifier();
    }

    @Override
    public String toString() {
        return String.format("%s: %s: %s", getTreeId(), NodeType.SECTION, getTextBlock().buildSummary());
    }

    /**
     * Returns the first headline directly under this section, falling back to the parent's
     * headline when this section has none of its own.
     */
    public Headline getHeadline() {
        return streamChildrenOfType(NodeType.HEADLINE)
                .map(Headline.class::cast)
                .findFirst()
                .orElseGet(() -> getParent().getHeadline());
    }

    /**
     * Checks if any headline within this section or its sub-nodes contains a given string.
     *
     * @param value the string to search for within headlines, case-sensitive
     * @return true if at least one headline contains the specified string
     */
    public boolean anyHeadlineContainsString(String value) {
        return streamAllSubNodesOfType(NodeType.HEADLINE)
                .anyMatch(headline -> headline.containsString(value));
    }

    /**
     * Checks if any headline within this section or its sub-nodes contains a given string, case-insensitive.
     *
     * @param value the string to search for within headlines, case-insensitive
     * @return true if at least one headline contains the specified string
     */
    public boolean anyHeadlineContainsStringIgnoreCase(String value) {
        return streamAllSubNodesOfType(NodeType.HEADLINE)
                .anyMatch(headline -> headline.containsStringIgnoreCase(value));
    }
}

View File

@ -1,158 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Represents a unique identifier for a section within a document (e.g. the "2.1.3" in a
 * numbered headline). Instances are immutable and created via the static factories.
 */
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {

    // final: the compiled pattern is shared state and must never be reassigned.
    // Captures up to four dot/comma/semicolon-separated numeric components at the start of a headline.
    static final Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");

    private enum Format {
        EMPTY,
        NUMERICAL,
        DOCUMENT
    }

    Format format;
    String identifierString;
    List<Integer> identifiers;
    // Marks an identifier that acts as a direct child of the section it was derived from.
    boolean asChild;

    /**
     * Generates a SectionIdentifier from the headline text of a section, determining its format and structure.
     *
     * @param headline The headline text from which to generate the section identifier.
     * @return A {@link SectionIdentifier} instance corresponding to the headline text.
     */
    public static SectionIdentifier fromSearchText(String headline) {
        // isBlank() already covers the empty string, so a single check suffices.
        if (headline == null || headline.isBlank()) {
            return SectionIdentifier.empty();
        }
        Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline);
        if (numericalIdentifierMatcher.find()) {
            return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
        }
        // more formats here
        return SectionIdentifier.empty();
    }

    /**
     * Marks the current section identifier as a child of another section.
     *
     * @param sectionIdentifier The parent section identifier.
     * @return A new {@link SectionIdentifier} instance marked as a child.
     */
    public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {
        return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true);
    }

    /**
     * Generates a SectionIdentifier that represents the entire document.
     *
     * @return A {@link SectionIdentifier} with a document-wide scope.
     */
    public static SectionIdentifier document() {
        return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
    }

    /**
     * Generates an empty SectionIdentifier.
     *
     * @return An empty {@link SectionIdentifier} instance.
     */
    public static SectionIdentifier empty() {
        return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
    }

    // Extracts the numeric components matched by the pattern. Iteration is bounded by the
    // pattern's own group count instead of a hard-coded 4, so pattern and loop cannot drift apart.
    private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) {
        String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end());
        List<Integer> identifiers = new LinkedList<>();
        for (int i = 1; i <= numericalIdentifierMatcher.groupCount(); i++) {
            String numericalIdentifier = numericalIdentifierMatcher.group(i);
            // Stop at the first absent, blank, or "0" component (isBlank covers empty).
            if (numericalIdentifier == null || numericalIdentifier.isBlank() || numericalIdentifier.equals("0")) {
                break;
            }
            identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
        }
        return new SectionIdentifier(Format.NUMERICAL, identifierString, List.copyOf(identifiers), false);
    }

    /**
     * Determines if the current section is the parent of the given section.
     *
     * @param sectionIdentifier The section identifier to compare against.
     * @return true if the current section is the parent of the given section, false otherwise.
     */
    public boolean isParentOf(SectionIdentifier sectionIdentifier) {
        if (this.format == Format.EMPTY) {
            return false;
        }
        // The document scope is the parent of everything.
        if (this.format == Format.DOCUMENT) {
            return true;
        }
        if (this.format != sectionIdentifier.format) {
            return false;
        }
        // A parent must have strictly fewer components, unless the candidate is explicitly
        // marked as a child, in which case equal length is also accepted.
        if (this.identifiers.size() >= sectionIdentifier.identifiers.size()
                && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) {
            return false;
        }
        // All of this identifier's components must be a prefix of the candidate's.
        for (int i = 0; i < this.identifiers.size(); i++) {
            if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Determines if the current section is a child of the given section, based on their identifiers.
     *
     * @param sectionIdentifier The section identifier to compare against.
     * @return True if the current section is a child of the given section, false otherwise.
     */
    public boolean isChildOf(SectionIdentifier sectionIdentifier) {
        if (this.format == Format.DOCUMENT || this.format == Format.EMPTY) {
            return false;
        }
        return sectionIdentifier.isParentOf(this);
    }

    @Override
    public String toString() {
        return identifierString;
    }
}

View File

@ -1,684 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
public interface SemanticNode {

    /**
     * Returns the type of this node, such as Section, Paragraph, etc.
     *
     * @return NodeType of this node
     */
    NodeType getType();

    /**
     * Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock.
     * So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock.
     * If the Node is a Leaf, the LeafTextBlock will be returned instead.
     *
     * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
     */
    default TextBlock getTextBlock() {
        return streamAllSubNodes().filter(SemanticNode::isLeaf)
        .map(SemanticNode::getTextBlock)
        .collect(new TextBlockCollector());
    }

    /**
     * Any Node maintains its own Set of Entities.
     * This Set contains all Entities whose TextRange intersects the TextRange of this node.
     *
     * @return Set of all Entities associated with this Node
     */
    Set<TextEntity> getEntities();

    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
     *
     * @return Set of PageNodes this node appears on.
     */
    default Set<Page> getPages() {
        return getTextBlock().getPages();
    }

    /**
     * Finds the first page associated with this Node.
     *
     * @return the Page with the lowest page number this node appears on; throws if the node has no pages.
     */
    default Page getFirstPage() {
        return getTextBlock().getPages()
        .stream()
        .min(Comparator.comparingInt(Page::getNumber))
        .orElseThrow();
    }

    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages for this TextRange, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
     *
     * @param textRange the range to resolve to pages; must intersect this node's own TextRange
     * @return Set of PageNodes the given range appears on.
     */
    default Set<Page> getPages(TextRange textRange) {
        if (!getTextRange().intersects(textRange)) {
            throw new IllegalArgumentException(format("%s which was used to query for pages is not intersected in the %s of this node!", textRange, getTextRange()));
        }
        return getTextBlock().getPages(textRange);
    }

    /**
     * Checks if the given page number exists in the list of pages.
     *
     * @param pageNumber the page number to be checked
     * @return true if the page number exists, otherwise false
     */
    default boolean onPage(int pageNumber) {
        return getPages().stream()
        .anyMatch(page -> page.getNumber() == pageNumber);
    }

    /**
     * Returns the DocumentTree Object.
     *
     * @return the DocumentTree of the Document this node belongs to
     */
    DocumentTree getDocumentTree();

    /**
     * The id is a List of Integers uniquely identifying this node in the DocumentTree.
     *
     * @return the DocumentTree ID
     */
    List<Integer> getTreeId();

    /**
     * This should only be used during graph construction.
     *
     * @param tocId List of Integers uniquely identifying this node in the DocumentTree
     */
    void setTreeId(List<Integer> tocId);

    /**
     * Traverses the Tree up, until it hits a Headline or hits a Section which will then return the first Headline from its children.
     * If no Headline is found this way, it will recursively traverse the tree up and try again until it hits the root, where it will perform a BFS.
     * If no Headline exists anywhere in the Document a dummy Headline is returned.
     *
     * @return First Headline found.
     */
    default Headline getHeadline() {
        return getParent().getHeadline();
    }

    /**
     * Returns a SectionIdentifier, such that it acts as a child of the first Headline associated with this SemanticNode.
     *
     * @return The SectionIdentifier from the first Headline.
     */
    default SectionIdentifier getSectionIdentifier() {
        return SectionIdentifier.asChildOf(getHeadline().getSectionIdentifier());
    }

    /**
     * Checks if its TreeId has a length greater than zero.
     *
     * @return boolean indicating whether this Node has a Parent in the DocumentTree
     */
    default boolean hasParent() {
        return getDocumentTree().hasParentById(getTreeId());
    }

    /**
     * @return The SemanticNode representing the Parent in the DocumentTree
     * throws NotFoundException, when no parent is present
     */
    default SemanticNode getParent() {
        return getDocumentTree().getParentEntryById(getTreeId()).getNode();
    }

    /**
     * @return The SemanticNode which is directly underneath the document and also under which this node is.
     * if this is the highest child node or the document itself, it returns itself.
     */
    default SemanticNode getHighestParent() {
        return getDocumentTree().getHighestParentById(getTreeId());
    }

    /**
     * Returns the next sibling node of this SemanticNode in the document tree, if any.
     * If there is no next sibling node, an empty Optional is returned.
     *
     * @return Optional containing the next sibling node, or empty if there is none
     */
    default Optional<SemanticNode> getNextSibling() {
        return getDocumentTree().getNextSibling(getTreeId());
    }

    /**
     * Returns the previous sibling node of this SemanticNode in the document tree, if any.
     * If there is no previous sibling node, an empty Optional is returned.
     *
     * @return Optional containing the previous sibling node, or empty if there is none
     */
    default Optional<SemanticNode> getPreviousSibling() {
        return getDocumentTree().getPreviousSibling(getTreeId());
    }

    /**
     * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
     * Currently only Sections, Images, and Tables are not leaves.
     * A TableCell might be a leaf depending on its area compared to the page.
     *
     * @return boolean, indicating if a Node has direct access to a TextBlock
     */
    default boolean isLeaf() {
        return false;
    }

    /**
     * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
     * Currently only Sections and Tables are not leaves.
     *
     * @return the leaf TextBlock of this node
     * @throws UnsupportedOperationException when this node is not a leaf
     */
    default TextBlock getLeafTextBlock() {
        throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
    }

    /**
     * Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode.
     *
     * @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
     */
    default void setLeafTextBlock(TextBlock textBlock) {
        throw new UnsupportedOperationException();
    }

    /**
     * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
     * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
     *
     * @return Integer representing the number on the page
     */
    default Integer getNumberOnPage() {
        TextBlock textBlock = getTextBlock();
        if (!textBlock.getAtomicTextBlocks().isEmpty()) {
            return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
        } else {
            return -1;
        }
    }

    /**
     * Checks if the SemanticNode contains any text.
     *
     * @return true, if this node's TextBlock is not empty
     */
    default boolean hasText() {
        return !getTextBlock().isEmpty();
    }

    /**
     * Checks whether this SemanticNode contains the provided String.
     *
     * @param string A String which the TextBlock might contain
     * @return true, if this node's TextBlock contains the string
     */
    default boolean containsString(String string) {
        return getTextBlock().getSearchText().contains(string);
    }

    /**
     * The set of layout engines that contributed to this node.
     *
     * @return mutable Set of LayoutEngines associated with this node
     */
    Set<LayoutEngineProto.LayoutEngine> getEngines();

    /** Records that the given layout engine contributed to this node. */
    default void addEngine(LayoutEngineProto.LayoutEngine engine) {
        getEngines().add(engine);
    }

    /**
     * Checks whether this SemanticNode contains all the provided Strings.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains all strings
     */
    default boolean containsAllStrings(String... strings) {
        return Arrays.stream(strings)
        .allMatch(this::containsString);
    }

    /**
     * Checks whether this SemanticNode contains any of the provided Strings.
     *
     * @param strings A List of Strings to check if they are contained in the TextBlock
     * @return true, if this node's TextBlock contains any of the provided strings
     */
    default boolean containsAnyString(String... strings) {
        return Arrays.stream(strings)
        .anyMatch(this::containsString);
    }

    /**
     * Checks whether this SemanticNode contains any of the provided Strings.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the strings
     */
    default boolean containsAnyString(List<String> strings) {
        return strings.stream()
        .anyMatch(this::containsString);
    }

    /**
     * Checks whether this SemanticNode contains the provided String, case-insensitive.
     *
     * @param string A String which the TextBlock might contain
     * @return true, if this node's TextBlock contains the string case-insensitive
     */
    default boolean containsStringIgnoreCase(String string) {
        return getTextBlock().getSearchText().toLowerCase(Locale.ROOT).contains(string.toLowerCase(Locale.ROOT));
    }

    /**
     * Checks whether this SemanticNode contains any of the provided Strings case-insensitive.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the strings
     */
    default boolean containsAnyStringIgnoreCase(String... strings) {
        return Arrays.stream(strings)
        .anyMatch(this::containsStringIgnoreCase);
    }

    /**
     * Checks whether this SemanticNode contains all of the provided Strings case-insensitive.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains all of the strings
     */
    default boolean containsAllStringsIgnoreCase(String... strings) {
        return Arrays.stream(strings)
        .allMatch(this::containsStringIgnoreCase);
    }

    /**
     * Checks whether this SemanticNode contains exactly the provided String as a word.
     *
     * @param word - String which the TextBlock might contain
     * @return true, if this node's TextBlock contains string
     */
    default boolean containsWord(String word) {
        return getTextBlock().getWords()
        .stream()
        .anyMatch(s -> s.equals(word));
    }

    /**
     * Checks whether this SemanticNode contains exactly the provided String as a word case-insensitive.
     *
     * @param word - String which the TextBlock might contain
     * @return true, if this node's TextBlock contains string
     */
    default boolean containsWordIgnoreCase(String word) {
        // NOTE(review): lowercases the query with Locale.ENGLISH while the word list uses the
        // default-locale String::toLowerCase; other methods here use Locale.ROOT - confirm intended.
        return getTextBlock().getWords()
        .stream()
        .map(String::toLowerCase)
        .anyMatch(s -> s.equals(word.toLowerCase(Locale.ENGLISH)));
    }

    /**
     * Checks whether this SemanticNode contains any of the provided Strings as a word.
     *
     * @param words - A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the provided strings
     */
    default boolean containsAnyWord(String... words) {
        return Arrays.stream(words)
        .anyMatch(word -> getTextBlock().getWords()
        .stream()
        .anyMatch(word::equals));
    }

    /**
     * Checks whether this SemanticNode contains any of the provided Strings as a word case-insensitive.
     *
     * @param words - A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the provided strings
     */
    default boolean containsAnyWordIgnoreCase(String... words) {
        return Arrays.stream(words)
        .map(String::toLowerCase)
        .anyMatch(word -> getTextBlock().getWords()
        .stream()
        .map(String::toLowerCase)
        .anyMatch(word::equals));
    }

    /**
     * Checks whether this SemanticNode contains all the provided Strings as word.
     *
     * @param words - A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains all the provided strings
     */
    default boolean containsAllWords(String... words) {
        return Arrays.stream(words)
        .allMatch(word -> getTextBlock().getWords()
        .stream()
        .anyMatch(word::equals));
    }

    /**
     * Checks whether this SemanticNode contains all the provided Strings as word case-insensitive.
     *
     * @param words - A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains all the provided strings
     */
    default boolean containsAllWordsIgnoreCase(String... words) {
        return Arrays.stream(words)
        .map(String::toLowerCase)
        .allMatch(word -> getTextBlock().getWords()
        .stream()
        .map(String::toLowerCase)
        .anyMatch(word::equals));
    }

    /**
     * Checks whether this SemanticNode intersects the provided rectangle.
     *
     * @param x the lower left corner X value
     * @param y the lower left corner Y value
     * @param w width
     * @param h height
     * @param pageNumber the pageNumber of the rectangle
     * @return true if intersects, false otherwise
     */
    default boolean intersectsRectangle(int x, int y, int w, int h, int pageNumber) {
        return getBBox().entrySet()
        .stream()
        .filter(entry -> entry.getKey().getNumber() == pageNumber)
        .map(Map.Entry::getValue)
        .anyMatch(rect -> rect.intersects(x, y, w, h));
    }

    /**
     * This function is used during insertion of EntityNodes into the graph, it checks if the TextRange of the RedactionEntity intersects or even contains the RedactionEntity.
     * It sets the fields accordingly and recursively calls this function on all its children.
     *
     * @param textEntity RedactionEntity, which is being inserted into the graph
     */
    default void addThisToEntityIfIntersects(TextEntity textEntity) {
        TextBlock textBlock = getTextBlock();
        if (textBlock.getTextRange().intersects(textEntity.getTextRange())) {
            // Full containment is tracked separately so the entity knows its deepest enclosing node.
            if (textBlock.containsTextRange(textEntity.getTextRange())) {
                textEntity.setDeepestFullyContainingNode(this);
            }
            textEntity.addIntersectingNode(this);
            getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange())
            .forEach(node -> node.addThisToEntityIfIntersects(textEntity));
        }
    }

    /**
     * Streams all children located directly underneath this node in the DocumentTree.
     *
     * @return Stream of all children
     */
    default Stream<SemanticNode> streamChildren() {
        return getDocumentTree().childNodes(getTreeId());
    }

    /**
     * Streams all children located directly underneath this node in the DocumentTree of the provided type.
     *
     * @param nodeType the NodeType to filter children by
     * @return Stream of all children of the given type
     */
    default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {
        return getDocumentTree().childNodesOfType(getTreeId(), nodeType);
    }

    /**
     * Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
     *
     * @return Stream of all SubNodes
     */
    default Stream<SemanticNode> streamAllSubNodes() {
        return getDocumentTree().allSubEntriesInOrder(getTreeId())
        .map(DocumentTree.Entry::getNode);
    }

    /**
     * Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order.
     *
     * @param nodeType the NodeType to filter sub-nodes by
     * @return Stream of all SubNodes of the given type
     */
    default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
        return getDocumentTree().allSubEntriesInOrder(getTreeId())
        .filter(entry -> entry.getType().equals(nodeType))
        .map(DocumentTree.Entry::getNode);
    }

    /**
     * The TextRange is the start and end string offsets in the reading order of the document.
     *
     * @return TextRange of this Node's TextBlock
     */
    default TextRange getTextRange() {
        return getTextBlock().getTextRange();
    }

    /**
     * Returns the length of the text content in this Node's TextBlock.
     *
     * @return The length of the text content
     */
    default int length() {
        return getTextRange().length();
    }

    /**
     * For a given TextRange this function returns a List of rectangles around the text in the range.
     * These Rectangles are split either by a new line or by a large gap in the current line.
     * This is mainly used to find the positions of TextEntities.
     *
     * @param textRange A TextRange to calculate the positions for.
     * @return A Map, where the keys are the pages and the values are a list of rectangles describing the position of words
     */
    default Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange textRange) {
        if (isLeaf()) {
            return getTextBlock().getPositionsPerPage(textRange);
        }
        // Delegate to the first child fully containing the range, if any; otherwise resolve here.
        Optional<SemanticNode> containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange);
        if (containingChildNode.isEmpty()) {
            return getTextBlock().getPositionsPerPage(textRange);
        }
        return containingChildNode.get().getPositionsPerPage(textRange);
    }

    /**
     * If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
     * If called on the Document, it will return the cropbox of each page.
     *
     * @return Rectangle2D fully encapsulating this Node for each page.
     */
    default Map<Page, Rectangle2D> getBBox() {
        if (isLeaf()) {
            return getBBoxFromLeafTextBlock();
        }
        return getBBoxFromChildren();
    }

    /**
     * Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page.
     *
     * @param rectangle2D The rectangle to check if it is contained
     * @param pageNumber The Page number on which the rectangle should be checked
     * @return boolean
     */
    default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {
        // Page equality is defined by number only, so a number-only instance works as a lookup key.
        Page helperPage = Page.builder().number(pageNumber).build();
        if (!getPages().contains(helperPage)) {
            return false;
        }
        return getBBox().get(helperPage).contains(rectangle2D);
    }

    /**
     * TODO: this produces unwanted results for sections spanning multiple columns.
     * Computes the Union of the bounding boxes of all children recursively.
     *
     * @return The union of the BoundingBoxes of all children
     */
    private Map<Page, Rectangle2D> getBBoxFromChildren() {
        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox)
        .toList();
        Set<Page> pages = childrenBBoxes.stream()
        .flatMap(map -> map.keySet()
        .stream())
        .collect(Collectors.toSet());
        for (Page page : pages) {
            Rectangle2D bBoxOnPage = childrenBBoxes.stream()
            .filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
            .map(childBboxPerPage -> childBboxPerPage.get(page))
            .collect(RectangleTransformations.collectBBox());
            bBoxPerPage.put(page, bBoxOnPage);
        }
        return bBoxPerPage;
    }

    /**
     * @return The union of all BoundingBoxes of the TextBlock of this node
     */
    private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
        .stream()
        .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atomicTextBlocks) -> bBoxPerPage.put(page, RectangleTransformations.atomicTextBlockBBox(atomicTextBlocks)));
        return bBoxPerPage;
    }

    /**
     * Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
     *
     * @param page the page to check
     * @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
     */
    default boolean onlyOnPage(Page page) {
        Set<Page> pages = getPages();
        return pages.size() == 1 && pages.contains(page);
    }
}

View File

@ -1,89 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Represents a super-section within a document: a grouping node above regular sections,
 * encapsulating both textual content and semantic structure.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class SuperSection extends AbstractSemanticNode {

    /** Always {@link NodeType#SUPER_SECTION}. */
    @Override
    public NodeType getType() {
        return NodeType.SUPER_SECTION;
    }

    /**
     * Checks if this super-section contains any tables.
     *
     * @return true if at least one table node exists anywhere below this super-section
     */
    public boolean hasTables() {
        return streamAllSubNodesOfType(NodeType.TABLE).anyMatch(table -> true);
    }

    /** A super-section's identifier is the identifier of its own headline (no child marker). */
    @Override
    public SectionIdentifier getSectionIdentifier() {
        return getHeadline().getSectionIdentifier();
    }

    @Override
    public String toString() {
        return String.format("%s: %s: %s", getTreeId(), NodeType.SUPER_SECTION, getTextBlock().buildSummary());
    }

    /**
     * Returns the first headline directly under this super-section, falling back to the parent's
     * headline when this super-section has none of its own.
     */
    public Headline getHeadline() {
        return streamChildrenOfType(NodeType.HEADLINE)
                .map(Headline.class::cast)
                .findFirst()
                .orElseGet(() -> getParent().getHeadline());
    }

    /**
     * Checks if any headline within this super-section or its sub-nodes contains a given string.
     *
     * @param value The string to search for within headlines, case-sensitive.
     * @return True if at least one headline contains the specified string, false otherwise.
     */
    public boolean anyHeadlineContainsString(String value) {
        return streamAllSubNodesOfType(NodeType.HEADLINE)
                .anyMatch(headline -> headline.containsString(value));
    }

    /**
     * Checks if any headline within this super-section or its sub-nodes contains a given string, case-insensitive.
     *
     * @param value The string to search for within headlines, case-insensitive.
     * @return True if at least one headline contains the specified string, false otherwise.
     */
    public boolean anyHeadlineContainsStringIgnoreCase(String value) {
        return streamAllSubNodesOfType(NodeType.HEADLINE)
                .anyMatch(headline -> headline.containsStringIgnoreCase(value));
    }
}

View File

@ -1,306 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
* Represents a table within a document.
*/
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Table implements SemanticNode {
@Builder.Default
Set<LayoutEngineProto.LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM));
@EqualsAndHashCode.Include
List<Integer> treeId;
DocumentTree documentTree;
int numberOfRows;
int numberOfCols;
TextBlock textBlock;
@Builder.Default
Set<TextEntity> entities = new HashSet<>();
Map<Page, Rectangle2D> bBoxCache;
@Override
public Map<Page, Rectangle2D> getBBox() {
if (bBoxCache == null) {
bBoxCache = SemanticNode.super.getBBox();
}
return bBoxCache;
}
/**
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
*
* @param strings Strings to check whether a row contains them
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
*/
public Stream<TextEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
return IntStream.range(0, numberOfRows).boxed()
.filter(row -> rowContainsStringsIgnoreCase(row, strings))
.flatMap(this::streamRow)
.map(TableCell::getEntities)
.flatMap(Collection::stream);
}
/**
* Checks whether the specified row contains all the provided strings.
*
* @param row the row to check as an Integer, must be smaller than numberOfRows
* @param strings a list of strings to check for
* @return true, if all strings appear in the provided row
*/
public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {
String rowText = streamRow(row).map(TableCell::getTextBlock)
.collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);
return strings.stream()
.map(String::toLowerCase)
.allMatch(rowText::contains);
}
/**
 * Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
 *
 * @param header the header value to search for
 * @param value the string which the table cell should contain
 * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
 */
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
    // Columns whose header cell contains the requested header text.
    List<Integer> matchingCols = streamHeaders()
            .filter(headerCell -> headerCell.containsString(header))
            .map(TableCell::getCol)
            .toList();
    // Keep every cell whose row, in one of those columns, contains the value.
    return streamTableCells()
            .filter(cell -> matchingCols.stream()
                    .anyMatch(col -> getCell(cell.getRow(), col).containsString(value)))
            .flatMap(cell -> cell.getEntities().stream());
}
/**
 * Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
 *
 * @param header the header value to search for
 * @param values the strings which the table cell should contain
 * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
 */
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
    // Columns whose header cell contains the requested header text.
    List<Integer> colsWithHeader = streamHeaders()
            .filter(headerCell -> headerCell.containsString(header))
            .map(TableCell::getCol)
            .toList();
    // Keep every cell whose row, in one of those columns, contains any of the values.
    return streamTableCells()
            .filter(cell -> colsWithHeader.stream()
                    .anyMatch(col -> getCell(cell.getRow(), col).containsAnyString(values)))
            .flatMap(cell -> cell.getEntities().stream());
}
/**
 * Returns a TableCell at the provided row and column location.
 *
 * @param row zero-based row index, must be in [0, numberOfRows)
 * @param col zero-based column index, must be in [0, numberOfCols)
 * @return TableCell at the provided location in the table
 * @throws IllegalArgumentException if row or col is out of bounds
 */
public TableCell getCell(int row, int col) {
    // The previous check (numberOfRows - row < 0 || numberOfCols - col < 0) accepted
    // row == numberOfRows / col == numberOfCols and every negative index, deferring the
    // failure to a confusing IndexOutOfBoundsException (or a wrong cell for negative col).
    if (row < 0 || row >= numberOfRows || col < 0 || col >= numberOfCols) {
        throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
    }
    // Cells are stored row-major as children of this table's tree entry.
    int idx = row * numberOfCols + col;
    return (TableCell) documentTree.getEntryById(treeId).getChildren().get(idx).getNode();
}
/**
 * Streams all TableCells in this Table row-wise.
 *
 * @return Stream of all TableCells
 */
public Stream<TableCell> streamTableCells() {
    return streamChildrenOfType(NodeType.TABLE_CELL).map(TableCell.class::cast);
}
/**
 * Streams all non-header TableCells in this Table whose column header contains the provided string, row-wise.
 *
 * @param header string the column's header cell must contain
 * @return Stream of all non-header TableCells in matching columns
 */
public Stream<TableCell> streamTableCellsWithHeader(String header) {
    List<Integer> matchingCols = streamHeaders()
            .filter(headerCell -> headerCell.getTextBlock().getSearchText().contains(header))
            .map(TableCell::getCol)
            .toList();
    return matchingCols.stream()
            .flatMap(this::streamCol)
            .filter(cell -> !cell.isHeader());
}
/**
 * Streams all TableCells belonging to the provided column from top down.
 *
 * @param col int representing the column
 * @return Stream of all TableCell in the provided column
 */
public Stream<TableCell> streamCol(int col) {
    return IntStream.range(0, numberOfRows)
            .mapToObj(rowIdx -> getCell(rowIdx, col));
}
/**
 * Streams all TableCells belonging to the provided row from left to right.
 *
 * @param row int representing the row
 * @return Stream of all TableCell in the provided row
 */
public Stream<TableCell> streamRow(int row) {
    return IntStream.range(0, numberOfCols)
            .mapToObj(colIdx -> getCell(row, colIdx));
}
/**
 * Streams all TableCells of this table that are marked as headers, row-wise.
 *
 * @return Stream of all TableCells with header == true
 */
public Stream<TableCell> streamHeaders() {
    return streamTableCells().filter(cell -> cell.isHeader());
}
/**
 * Streams all header TableCells sharing the provided row or column.
 *
 * @param row int representing the row
 * @param col int representing the column
 * @return Stream of all TableCells with header == true in the provided row or col
 */
public Stream<TableCell> streamHeadersForCell(int row, int col) {
    Stream<TableCell> cellsInRow = streamRow(row);
    Stream<TableCell> cellsInCol = streamCol(col);
    return Stream.concat(cellsInRow, cellsInCol)
            .filter(TableCell::isHeader);
}
/**
 * Checks whether any header cell's stripped text equals the provided string exactly.
 *
 * @param header string to check the headers for
 * @return true, if at least one header equals the provided string
 */
public boolean hasHeader(String header) {
    return streamHeaders()
            .map(headerCell -> headerCell.getTextBlock().getSearchText().strip())
            .anyMatch(text -> text.equals(header));
}
/**
 * Checks whether any header cell's stripped text equals the provided string, ignoring case.
 *
 * @param header string to check the headers for
 * @return true, if at least one header equals the provided string ignoring case
 */
public boolean hasHeaderIgnoreCase(String header) {
    return streamHeaders()
            .map(headerCell -> headerCell.getTextBlock().getSearchText().strip().toLowerCase(Locale.ENGLISH))
            .anyMatch(text -> text.equals(header.toLowerCase(Locale.ENGLISH)));
}
/**
 * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
 *
 * @param header string to find header cells
 * @param value string to check cells with provided header
 * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
 */
public boolean hasRowWithHeaderAndValue(String header, String value) {
    return streamTableCellsWithHeader(header)
            .anyMatch(cell -> cell.containsString(value));
}
/**
 * Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
 *
 * @param header string to find header cells
 * @param values List of strings to check cells with provided header
 * @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
 */
public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {
    return streamTableCellsWithHeader(header)
            .anyMatch(cell -> cell.containsAnyString(values));
}
// This semantic node always represents a table.
@Override
public NodeType getType() {
    return NodeType.TABLE;
}
@Override
public TextBlock getTextBlock() {
    // Lazily build the table's concatenated text once and memoise it.
    // Not thread-safe: concurrent first calls may compute twice.
    if (textBlock != null) {
        return textBlock;
    }
    textBlock = SemanticNode.super.getTextBlock();
    return textBlock;
}
@Override
public String toString() {
    // Tree id, node type, grid dimensions and a short text summary of the table.
    StringBuilder sb = new StringBuilder();
    sb.append(treeId.toString())
      .append(": ").append(NodeType.TABLE)
      .append(": #cols: ").append(numberOfCols)
      .append(", #rows: ").append(numberOfRows)
      .append(", ").append(this.getTextBlock().buildSummary());
    return sb.toString();
}
}

View File

@ -1,84 +0,0 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents a single table cell within a table.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class TableCell extends AbstractSemanticNode {

    // Zero-based grid coordinates of this cell within its table.
    int row;
    int col;
    // True if this cell is a header cell.
    boolean header;
    // Bounding box of this cell; reported identically for every page the cell appears on.
    Rectangle2D bBox;
    // Text directly attached to this cell when it is a leaf of the document tree.
    TextBlock leafTextBlock;
    // Lazily aggregated text from all leaf descendants (non-leaf cells only).
    TextBlock textBlock;

    @Override
    public Map<Page, Rectangle2D> getBBox() {
        // Associate the single cell bounding box with each page this cell spans.
        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        for (Page page : getPages()) {
            bBoxPerPage.put(page, bBox);
        }
        return bBoxPerPage;
    }

    @Override
    public NodeType getType() {
        return NodeType.TABLE_CELL;
    }

    @Override
    public boolean isLeaf() {
        // A cell is a leaf when its tree entry has no children.
        var children = getDocumentTree().getEntryById(getTreeId()).getChildren();
        return children.isEmpty();
    }

    @Override
    public TextBlock getTextBlock() {
        // Leaf cells carry their text directly.
        if (isLeaf()) {
            return leafTextBlock;
        }
        // Non-leaf cells lazily concatenate the text of all leaf descendants
        // (not thread-safe: concurrent first calls may compute twice).
        if (textBlock != null) {
            return textBlock;
        }
        textBlock = streamAllSubNodes()
                .filter(SemanticNode::isLeaf)
                .map(SemanticNode::getLeafTextBlock)
                .collect(new TextBlockCollector());
        return textBlock;
    }

    @Override
    public String toString() {
        return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + getTextBlock().buildSummary();
    }
}

View File

@ -1,257 +0,0 @@
package com.knecon.fforesight.llm.service.document.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {
Long id;
Integer numberOnPage;
Page page;
//string coordinates
TextRange textRange;
String searchText;
List<String> words;
List<Integer> lineBreaks;
//position coordinates
List<Integer> stringIdxToPositionIdx;
@Getter
List<Rectangle2D> positions;
@EqualsAndHashCode.Exclude
SemanticNode parent;
@Override
public int numberOfLines() {
return lineBreaks.size() + 1;
}
public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {
return AtomicTextBlock.builder()
.id(textBlockIdx)
.textRange(new TextRange(stringOffset, stringOffset))
.searchText("")
.lineBreaks(Collections.emptyList())
.page(page)
.numberOnPage(numberOnPage)
.stringIdxToPositionIdx(Collections.emptyList())
.positions(Collections.emptyList())
.parent(parent)
.build();
}
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextDataProto.DocumentTextData atomicTextBlockData,
DocumentPositionDataProto.DocumentPositionData atomicPositionBlockData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder()
.id(atomicTextBlockData.getId())
.numberOnPage(atomicTextBlockData.getNumberOnPage())
.page(page)
.textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
.searchText(atomicTextBlockData.getSearchText())
.lineBreaks(atomicTextBlockData.getLineBreaksList())
.stringIdxToPositionIdx(atomicPositionBlockData.getStringIdxToPositionIdxList())
.positions(toRectangle2DList(atomicPositionBlockData.getPositionsList()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(List<DocumentPositionDataProto.DocumentPositionData.Position> positions) {
return positions.stream()
.map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3)))
.toList();
}
public TextRange getLineTextRange(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {
return new TextRange(textRange.start(), textRange.start());
}
if (numberOfLines() == 1) {
return textRange;
}
if (lineNumber == 0) {
return new TextRange(textRange.start(), lineBreaks.get(0) + textRange.start());
} else if (lineNumber == numberOfLines() - 1) {
return new TextRange(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
}
return new TextRange(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
}
public List<String> getWords() {
if (words == null) {
words = new ArrayList<>();
BreakIterator iterator = BreakIterator.getWordInstance(Locale.ENGLISH);
iterator.setText(searchText);
int start = iterator.first();
for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
words.add(searchText.substring(start, end));
}
}
return words;
}
@Override
public List<AtomicTextBlock> getAtomicTextBlocks() {
return List.of(this);
}
@Override
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
.findFirst() //
.orElse(searchText.length()) + textRange.start();
}
@Override
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
.reduce((a, b) -> b)//
.orElse(0) + textRange.start();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
}
@Override
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
if (!containsTextRange(stringTextRange)) {
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
}
if (stringTextRange.length() == 0) {
return Collections.emptyList();
}
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
if (stringTextRange.end() == this.textRange.end()) {
return positions.subList(startPositionIdx, positions.size());
}
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
}
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
.stream()
.map(this::getPositions)
.map(RectangleTransformations::rectangleBBoxWithGaps)
.flatMap(Collection::stream)
.toList();
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
rectanglePerLinePerPage.put(page, rectanglesPerLine);
return rectanglePerLinePerPage;
}
@Override
public String subSequenceWithLineBreaks(TextRange textRange) {
if (textRange.length() == 0 || !getTextRange().contains(textRange)) {
return "";
}
Set<Integer> lbInBoundary = lineBreaks.stream()
.map(i -> i + textRange.start())
.filter(textRange::contains)
.collect(Collectors.toSet());
if (textRange.end() == getTextRange().end()) {
lbInBoundary.add(getTextRange().end());
}
StringBuilder sb = new StringBuilder();
for (int i = textRange.start(); i < textRange.end(); i++) {
char character = this.charAt(i);
if (lbInBoundary.contains(i + 1)) {
// always plus one, due to the linebreaks being an exclusive end index
if (!Character.isWhitespace(character)) {
lbInBoundary.remove(i + 1);
lbInBoundary.add(i + 2);
sb.append(character);
continue;
}
sb.append("\n");
} else {
sb.append(character);
}
}
return sb.toString();
}
private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
return getLineBreaks().stream()
.map(linebreak -> linebreak + this.textRange.start())
.filter(textRange::contains)
.toList();
}
@Override
public String toString() {
return searchText;
}
}

View File

@ -1,268 +0,0 @@
package com.knecon.fforesight.llm.service.document.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * A {@link TextBlock} composed of several consecutive {@link AtomicTextBlock}s.
 * Consecutiveness (each block starting exactly where the previous one ends) is
 * enforced by {@link #concat(TextBlock)}; most operations delegate to the atomic
 * block(s) covering the requested string index or range.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    // Lazily concatenated search text of all atomic blocks; invalidated on concat.
    String searchText;
    // Overall half-open string range covered by this block; (-1, -1) while empty.
    TextRange textRange;

    public static ConcatenatedTextBlock empty() {
        return new ConcatenatedTextBlock(Collections.emptyList());
    }

    /**
     * Builds a concatenated block from the given atomic blocks, which must be
     * ordered and consecutive (validated pairwise by concat).
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            textRange = new TextRange(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
        atomicTextBlocks.subList(1, atomicTextBlocks.size())
                .forEach(this::concat);
    }

    /**
     * Appends the given block, which must start exactly at this block's current end
     * (unless this block is still empty). Invalidates the cached search text.
     *
     * @throws UnsupportedOperationException if the block is not consecutive
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {
        if (this.atomicTextBlocks.isEmpty()) {
            // First block: adopt its range (TextRange is mutated in place).
            textRange.setStart(textBlock.getTextRange().start());
            textRange.setEnd(textBlock.getTextRange().end());
        } else if (textRange.end() != textBlock.getTextRange().start()) {
            throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        textRange.setEnd(textBlock.getTextRange().end());
        this.searchText = null;
        return this;
    }

    // Finds the atomic block whose range contains the given absolute string index.
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
        return atomicTextBlocks.stream()
                .filter(textBlock -> textBlock.getTextRange().contains(stringIdx))
                .findAny()
                .orElseThrow(IndexOutOfBoundsException::new);
    }

    // Collects all atomic blocks whose range intersects the given range, relying on
    // the blocks being ordered to stop early.
    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
        List<AtomicTextBlock> intersectingAtomicTextBlocks = new LinkedList<>();
        for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) {
            if (atomicTextBlock.getTextRange().start() > textRange.end()) {
                break; // early stop, following TextBlocks will never intersect
            }
            if (atomicTextBlock.getTextRange().intersects(textRange)) {
                intersectingAtomicTextBlocks.add(atomicTextBlock);
            }
        }
        return intersectingAtomicTextBlocks;
    }

    @Override
    public String getSearchText() {
        // Lazily concatenate and cache; cache is cleared by concat (not thread-safe).
        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }

    @Override
    public List<String> getWords() {
        return atomicTextBlocks.stream()
                .map(AtomicTextBlock::getWords)
                .flatMap(Collection::stream)
                .toList();
    }

    @Override
    public int numberOfLines() {
        // Sum of the per-block line counts.
        return atomicTextBlocks.stream()
                .mapToInt(AtomicTextBlock::numberOfLines).sum();
    }

    @Override
    public int getNextLinebreak(int fromIndex) {
        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }

    @Override
    public int getPreviousLinebreak(int fromIndex) {
        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }

    @Override
    public List<Integer> getLineBreaks() {
        // NOTE: these are the blocks' block-relative line-break offsets, concatenated.
        return getAtomicTextBlocks().stream()
                .flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
                        .stream())
                .toList();
    }

    @Override
    public Rectangle2D getPosition(int stringIdx) {
        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }

    /**
     * Returns the absolute string range of the lineNumber-th line, counting lines
     * block by block; an empty range anchored at this block's start when out of bounds.
     */
    public TextRange getLineTextRange(int lineNumber) {
        if (atomicTextBlocks.size() == 1) {
            return atomicTextBlocks.get(0).getLineTextRange(lineNumber);
        }
        int lineNumberInCurrentBlock = lineNumber;
        for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) {
            if (lineNumberInCurrentBlock < atomicTextBlock.numberOfLines()) {
                return atomicTextBlock.getLineTextRange(lineNumberInCurrentBlock);
            }
            lineNumberInCurrentBlock -= atomicTextBlock.numberOfLines();
        }
        return new TextRange(textRange.start(), textRange.start());
    }

    @Override
    public List<Rectangle2D> getPositions(TextRange stringTextRange) {
        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
        if (textBlocks.isEmpty()) {
            return Collections.emptyList();
        }
        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringTextRange);
        }
        // First block: from the requested start to that block's end.
        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
        // Middle blocks (if any) are fully covered.
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }
        // Last block: from its start to the requested end.
        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
        return positions;
    }

    @Override
    public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
        if (textBlocks.isEmpty()) {
            return new HashMap<>();
        }
        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositionsPerPage(stringTextRange);
        }
        // Same first/middle/last split as getPositions, merging the per-page maps.
        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
        }
        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
                lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
                        stringTextRange.end())));
        return rectanglesPerLinePerPage;
    }

    @Override
    public String subSequenceWithLineBreaks(TextRange textRange) {
        if (textRange.length() == 0 || !getTextRange().contains(textRange)) {
            return "";
        }
        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(textRange);
        if (textBlocks.size() == 1) {
            return textBlocks.get(0).subSequenceWithLineBreaks(textRange);
        }
        // Same first/middle/last split as getPositions, concatenating the text.
        StringBuilder sb = new StringBuilder();
        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(textRange.start(), firstTextBlock.getTextRange().end())));
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            sb.append(textBlock.searchTextWithLineBreaks());
        }
        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), textRange.end())));
        return sb.toString();
    }

    // Merges two page->rectangles maps, concatenating rectangle lists for shared pages.
    private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
        Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
        map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
                rectangles,
                (l1, l2) -> Stream.concat(l1.stream(), l2.stream())
                        .toList()));
        return mergedMap;
    }

    @Override
    public String toString() {
        return getSearchText();
    }
}

View File

@ -1,176 +0,0 @@
package com.knecon.fforesight.llm.service.document.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
/**
 * A readable piece of document text addressed in DOCUMENT-WIDE string coordinates:
 * every index-taking method expects absolute indices within {@link #getTextRange()},
 * not offsets local to this block. Also a {@link CharSequence} over the search text.
 */
public interface TextBlock extends CharSequence {

    /** Returns the plain search text of this block (without explicit line breaks). */
    String getSearchText();

    /** Returns the tokenised words of this block's text. */
    List<String> getWords();

    /** Returns the atomic constituents of this block, in order. */
    List<AtomicTextBlock> getAtomicTextBlocks();

    /** Returns the absolute, half-open [start, end) range this block covers. */
    TextRange getTextRange();

    /** Returns the absolute position of the first line break after fromIndex. */
    int getNextLinebreak(int fromIndex);

    /** Returns the absolute position of the last line break at or before fromIndex. */
    int getPreviousLinebreak(int fromIndex);

    /** Returns the absolute range of the given zero-based line. */
    TextRange getLineTextRange(int lineNumber);

    /** Returns the line-break offsets of the constituent blocks. */
    List<Integer> getLineBreaks();

    /** Returns the glyph rectangle of the character at the absolute index. */
    Rectangle2D getPosition(int stringIdx);

    /** Returns the glyph rectangles of all characters in the absolute range. */
    List<Rectangle2D> getPositions(TextRange stringTextRange);

    /** Returns per-line bounding rectangles of the range, grouped by page. */
    Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);

    /** Returns the text of the range with '\n' inserted at line breaks. */
    String subSequenceWithLineBreaks(TextRange textRange);

    /** Returns the number of lines in this block. */
    int numberOfLines();

    default CharSequence getLine(int lineNumber) {
        return subSequence(getLineTextRange(lineNumber));
    }

    default List<Rectangle2D> getLinePositions(int lineNumber) {
        return getPositions(getLineTextRange(lineNumber));
    }

    /** Bounding box of a whole line, computed from its glyph rectangles. */
    default Rectangle2D getLineBBox(int lineNumber) {
        return RectangleTransformations.rectangle2DBBox(getLinePositions(lineNumber));
    }

    /** The full search text with '\n' inserted at line breaks. */
    default String searchTextWithLineBreaks() {
        return subSequenceWithLineBreaks(getTextRange());
    }

    /** First absolute index of searchTerm in this block, or -1. */
    default int indexOf(String searchTerm) {
        return indexOf(searchTerm, getTextRange().start());
    }

    /** All pages any constituent block appears on. */
    default Set<Page> getPages() {
        return getAtomicTextBlocks().stream()
                .map(AtomicTextBlock::getPage)
                .collect(Collectors.toUnmodifiableSet());
    }

    /** Pages of the constituent blocks intersecting the given absolute range. */
    default Set<Page> getPages(TextRange textRange) {
        return getAtomicTextBlocks().stream()
                .filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
                .map(AtomicTextBlock::getPage)
                .collect(Collectors.toUnmodifiableSet());
    }

    /**
     * First absolute index of searchTerm at or after the absolute startOffset, or -1.
     * Translates to/from the local coordinates of the underlying search text.
     */
    default int indexOf(String searchTerm, int startOffset) {
        int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
        if (start == -1) {
            return -1;
        }
        return start + getTextRange().start();
    }

    default CharSequence getFirstLine() {
        return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
    }

    /**
     * Whether the given absolute range lies entirely inside this block.
     *
     * @throws IllegalArgumentException if the range's end precedes its start
     */
    default boolean containsTextRange(TextRange textRange) {
        if (textRange.end() < textRange.start()) {
            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
        }
        return getTextRange().contains(textRange);
    }

    default boolean containsIndex(int stringIndex) {
        return getTextRange().contains(stringIndex);
    }

    default CharSequence subSequence(TextRange textRange) {
        return subSequence(textRange.start(), textRange.end());
    }

    /** Short human-readable summary: the first few words of the search text. */
    default String buildSummary() {
        String searchText = getSearchText();
        // substring, as splitting very large strings gets expensive
        searchText = searchText.substring(0, Math.min(searchText.length(), 200));
        String[] words = searchText.split(" ");
        int bound = Math.min(words.length, 4);
        List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));
        return String.join(" ", list);
    }

    @Override
    default CharSequence subSequence(int start, int end) {
        // start/end are absolute; translate into local offsets of the search text.
        return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
    }

    @Override
    default int length() {
        return getTextRange().length();
    }

    @Override
    default char charAt(int index) {
        // index is absolute; translate into the local offset of the search text.
        return getSearchText().charAt(index - getTextRange().start());
    }
}

View File

@ -1,49 +0,0 @@
package com.knecon.fforesight.llm.service.document.textblock;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
/**
 * Collector that concatenates a stream of ordered, consecutive {@link TextBlock}s
 * into a single {@link ConcatenatedTextBlock}. Each accumulated block must start
 * exactly where the previous one ends (enforced by ConcatenatedTextBlock.concat).
 */
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {
        return ConcatenatedTextBlock::empty;
    }

    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
        return ConcatenatedTextBlock::concat;
    }

    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {
        // Merges two partial results; valid only when the left part ends exactly
        // where the right part starts (ordered splits of the same source).
        return ConcatenatedTextBlock::concat;
    }

    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {
        return a -> a;
    }

    @Override
    public Set<Characteristics> characteristics() {
        // CONCURRENT removed: it tells the framework the SAME accumulator container may
        // be mutated from multiple threads, but ConcatenatedTextBlock.concat appends to
        // an unsynchronised LinkedList and requires strictly ordered, consecutive
        // blocks — a concurrent/unordered evaluation would corrupt state or throw.
        // IDENTITY_FINISH is kept because the finisher is the identity function.
        return Set.of(Characteristics.IDENTITY_FINISH);
    }
}

View File

@ -3,11 +3,12 @@ package com.knecon.fforesight.llm.service.models;
import java.util.List;
import java.util.Optional;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.ConsecutiveTextBlockCollector;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.ChunkingResponseData;
import com.knecon.fforesight.llm.service.document.ConsecutiveTextBlockCollector;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.extern.slf4j.Slf4j;
@ -20,7 +21,7 @@ public record Chunk(String markdown, List<TextBlock> parts) {
}
private static List<TextBlock> getChunkParts(com.knecon.fforesight.llm.service.document.nodes.Document document, List<List<Integer>> treeIds) {
private static List<TextBlock> getChunkParts(Document document, List<List<Integer>> treeIds) {
return treeIds.stream()
.map(treeId -> {

View File

@ -1,41 +1,21 @@
package com.knecon.fforesight.llm.service.services;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.primitives.Floats;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.service.redaction.v1.server.data.DocumentPageProto;
import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto;
import com.iqser.red.service.redaction.v1.server.data.DocumentStructureWrapper;
import com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentGraphMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.llm.service.LlmNerMessage;
import com.knecon.fforesight.llm.service.document.DocumentData;
import com.knecon.fforesight.llm.service.document.DocumentGraphMapper;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.utils.StorageIdUtils;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@ -46,229 +26,24 @@ import lombok.extern.slf4j.Slf4j;
public class DocumentBuilderService {
StorageService storageService;
ObjectMapper mapper;
public Document build(LlmNerMessage llmNerMessage) {
DocumentData documentData = new DocumentData();
documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(fetchDocumentStructure(llmNerMessage.getDocumentStructureStorageId())));
documentData.setDocumentTextData(fetchDocumentTextData(llmNerMessage.getDocumentTextStorageId()));
documentData.setDocumentPositionData(fetchDocumentPositionData(llmNerMessage.getDocumentPositionStorageId()));
documentData.setDocumentPages(fetchAllDocumentPages(llmNerMessage.getDocumentPagesStorageId()));
documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(storageService.readProtoObject(TenantContext.getTenantId(),
llmNerMessage.getDocumentStructureStorageId(),
DocumentStructureProto.DocumentStructure.parser())));
documentData.setDocumentTextData(storageService.readProtoObject(TenantContext.getTenantId(),
llmNerMessage.getDocumentTextStorageId(),
DocumentTextDataProto.AllDocumentTextData.parser()));
documentData.setDocumentPositionData(storageService.readProtoObject(TenantContext.getTenantId(),
llmNerMessage.getDocumentPositionStorageId(),
DocumentPositionDataProto.AllDocumentPositionData.parser()));
documentData.setDocumentPages(storageService.readProtoObject(TenantContext.getTenantId(),
llmNerMessage.getDocumentPagesStorageId(),
DocumentPageProto.AllDocumentPages.parser()));
return DocumentGraphMapper.toDocumentGraph(documentData);
}
private DocumentStructureProto.DocumentStructure fetchDocumentStructure(String storageId) {
DocumentStructureProto.DocumentStructure documentStructure;
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
if (storageInfo.fileTypeExtension().contains("proto")) {
documentStructure = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentStructureProto.DocumentStructure.parser());
} else {
DocumentStructure oldDocumentStructure = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentStructure.class);
if (oldDocumentStructure == null) {
return null;
}
documentStructure = convertDocumentStructure(oldDocumentStructure);
}
return documentStructure;
}
private DocumentTextDataProto.AllDocumentTextData fetchDocumentTextData(String storageId) {
DocumentTextDataProto.AllDocumentTextData documentTextData;
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
if (storageInfo.fileTypeExtension().contains("proto")) {
documentTextData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentTextDataProto.AllDocumentTextData.parser());
} else {
DocumentTextData[] oldDocumentTextData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentTextData[].class);
if (oldDocumentTextData == null) {
return null;
}
documentTextData = convertAllDocumentTextData(oldDocumentTextData);
}
return documentTextData;
}
private DocumentPositionDataProto.AllDocumentPositionData fetchDocumentPositionData(String storageId) {
DocumentPositionDataProto.AllDocumentPositionData documentPositionData;
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
if (storageInfo.fileTypeExtension().contains("proto")) {
documentPositionData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPositionDataProto.AllDocumentPositionData.parser());
} else {
DocumentPositionData[] oldDocumentPositionData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPositionData[].class);
if (oldDocumentPositionData == null) {
return null;
}
documentPositionData = convertAllDocumentPositionData(oldDocumentPositionData);
}
return documentPositionData;
}
private DocumentPageProto.AllDocumentPages fetchAllDocumentPages(String storageId) {
DocumentPageProto.AllDocumentPages allDocumentPages;
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
if (storageInfo.fileTypeExtension().contains("proto")) {
allDocumentPages = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPageProto.AllDocumentPages.parser());
} else {
DocumentPage[] oldDocumentPages = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPage[].class);
if (oldDocumentPages == null) {
return null;
}
allDocumentPages = convertAllDocumentPages(oldDocumentPages);
}
return allDocumentPages;
}
private <T> T getOldData(String dossierId, String fileId, String fileType, Class<T> valueType) {
String oldStorageId = StorageIdUtils.getStorageId(dossierId, fileId, fileType, ".json");
System.out.println("----------------> LOOKING FOR " + oldStorageId);
try (InputStream inputStream = getObject(TenantContext.getTenantId(), oldStorageId)) {
return mapper.readValue(inputStream, valueType);
} catch (IOException e) {
log.error("Could not read JSON for " + fileType + ", error was: " + e);
return null;
}
}
private static EntryDataProto.EntryData convertEntryData(DocumentStructure.EntryData oldEntryData) {
EntryDataProto.EntryData.Builder builder = EntryDataProto.EntryData.newBuilder();
builder.setType(NodeTypeProto.NodeType.valueOf(oldEntryData.getType().name()));
builder.addAllTreeId(Arrays.stream(oldEntryData.getTreeId()).boxed()
.collect(Collectors.toList()));
builder.addAllAtomicBlockIds(Arrays.asList(oldEntryData.getAtomicBlockIds()));
builder.addAllPageNumbers(Arrays.asList(oldEntryData.getPageNumbers()));
builder.putAllProperties(oldEntryData.getProperties());
if (oldEntryData.getChildren() != null) {
oldEntryData.getChildren()
.forEach(child -> builder.addChildren(convertEntryData(child)));
}
return builder.build();
}
private static DocumentStructureProto.DocumentStructure convertDocumentStructure(DocumentStructure oldStructure) {
DocumentStructureProto.DocumentStructure.Builder newBuilder = DocumentStructureProto.DocumentStructure.newBuilder();
if (oldStructure.getRoot() != null) {
newBuilder.setRoot(convertEntryData(oldStructure.getRoot()));
}
return newBuilder.build();
}
private static DocumentPageProto.DocumentPage convertDocumentPage(DocumentPage oldPage) {
return DocumentPageProto.DocumentPage.newBuilder()
.setNumber(oldPage.getNumber())
.setHeight(oldPage.getHeight())
.setWidth(oldPage.getWidth())
.setRotation(oldPage.getRotation())
.build();
}
private static DocumentPageProto.AllDocumentPages convertAllDocumentPages(DocumentPage[] oldPages) {
DocumentPageProto.AllDocumentPages.Builder allPagesBuilder = DocumentPageProto.AllDocumentPages.newBuilder();
for (DocumentPage oldPage : oldPages) {
DocumentPageProto.DocumentPage newPage = convertDocumentPage(oldPage);
allPagesBuilder.addDocumentPages(newPage);
}
return allPagesBuilder.build();
}
private static DocumentPositionDataProto.DocumentPositionData convertDocumentPositionData(DocumentPositionData oldData) {
DocumentPositionDataProto.DocumentPositionData.Builder builder = DocumentPositionDataProto.DocumentPositionData.newBuilder()
.setId(oldData.getId())
.addAllStringIdxToPositionIdx(Arrays.stream(oldData.getStringIdxToPositionIdx()).boxed()
.collect(Collectors.toList()));
for (float[] pos : oldData.getPositions()) {
DocumentPositionDataProto.DocumentPositionData.Position position = DocumentPositionDataProto.DocumentPositionData.Position.newBuilder()
.addAllValue(Floats.asList(pos))
.build();
builder.addPositions(position);
}
return builder.build();
}
private static DocumentPositionDataProto.AllDocumentPositionData convertAllDocumentPositionData(DocumentPositionData[] oldDataList) {
DocumentPositionDataProto.AllDocumentPositionData.Builder allDataBuilder = DocumentPositionDataProto.AllDocumentPositionData.newBuilder();
for (DocumentPositionData oldData : oldDataList) {
allDataBuilder.addDocumentPositionData(convertDocumentPositionData(oldData));
}
return allDataBuilder.build();
}
private static DocumentTextDataProto.DocumentTextData convertDocumentTextData(DocumentTextData oldData) {
DocumentTextDataProto.DocumentTextData.Builder builder = DocumentTextDataProto.DocumentTextData.newBuilder()
.setId(oldData.getId())
.setPage(oldData.getPage())
.setSearchText(oldData.getSearchText())
.setNumberOnPage(oldData.getNumberOnPage())
.setStart(oldData.getStart())
.setEnd(oldData.getEnd())
.addAllLineBreaks(Arrays.stream(oldData.getLineBreaks()).boxed()
.collect(Collectors.toList()));
return builder.build();
}
private static DocumentTextDataProto.AllDocumentTextData convertAllDocumentTextData(DocumentTextData[] oldDataList) {
DocumentTextDataProto.AllDocumentTextData.Builder allDataBuilder = DocumentTextDataProto.AllDocumentTextData.newBuilder();
for (DocumentTextData oldData : oldDataList) {
allDataBuilder.addDocumentTextData(convertDocumentTextData(oldData));
}
return allDataBuilder.build();
}
@SneakyThrows
private InputStream getObject(String tenantId, String storageId) {
File tempFile = File.createTempFile("temp", ".data");
storageService.downloadTo(tenantId, storageId, tempFile);
return new BufferedInputStream(Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE));
}
}

View File

@ -22,6 +22,8 @@ import com.azure.ai.openai.models.ChatRequestUserMessage;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.llm.service.ChunkingResponse;
import com.knecon.fforesight.llm.service.EntityAiDescription;
@ -29,8 +31,6 @@ import com.knecon.fforesight.llm.service.LlmNerEntities;
import com.knecon.fforesight.llm.service.LlmNerEntity;
import com.knecon.fforesight.llm.service.LlmNerMessage;
import com.knecon.fforesight.llm.service.SystemMessageProvider;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.models.Chunk;
import com.knecon.fforesight.llm.service.utils.FormattingUtils;
import com.knecon.fforesight.tenantcommons.TenantContext;
@ -130,16 +130,15 @@ public class LlmNerService {
try {
entitiesWithUsage = mapEntitiesToDocument(chatCompletions, chunk.parts());
} catch (JsonProcessingException e) {
String faultyResponse = chatCompletions.getChoices()
.get(0).getMessage().getContent();
String faultyResponse = chatCompletions.getChoices().get(0).getMessage().getContent();
ChatCompletions correctionCompletions = runLLM(SystemMessageProvider.PROMPT_CORRECTION, faultyResponse);
try {
entitiesWithUsage = mapEntitiesToDocument(correctionCompletions, chunk.parts());
int completionTokens = chatCompletions.getUsage().getCompletionTokens() + correctionCompletions.getUsage().getCompletionTokens();
int promptTokens = chatCompletions.getUsage().getPromptTokens() + correctionCompletions.getUsage().getPromptTokens();
entitiesWithUsage = new EntitiesWithUsage(entitiesWithUsage.entities(), completionTokens, promptTokens);
entitiesWithUsage = new EntitiesWithUsage(entitiesWithUsage.entities(),
entitiesWithUsage.promptTokens() + chatCompletions.getUsage().getPromptTokens(),
entitiesWithUsage.completionTokens() + chatCompletions.getUsage().getCompletionTokens());
} catch (JsonProcessingException ex) {
throw new RuntimeException(ex);
@ -165,11 +164,10 @@ public class LlmNerService {
private EntitiesWithUsage mapEntitiesToDocument(ChatCompletions chatCompletions, List<TextBlock> chunkParts) throws JsonProcessingException {
EntitiesWithUsage allEntities = new EntitiesWithUsage(new LinkedList<>(), chatCompletions.getUsage().getCompletionTokens(), chatCompletions.getUsage().getPromptTokens());
EntitiesWithUsage allEntities = new EntitiesWithUsage(new LinkedList<>(), chatCompletions.getUsage().getPromptTokens(), chatCompletions.getUsage().getCompletionTokens());
if (!chatCompletions.getChoices().isEmpty()) {
ChatChoice choice = chatCompletions.getChoices()
.get(0);
ChatChoice choice = chatCompletions.getChoices().get(0);
Map<String, List<String>> entitiesPerType = parseResponse(choice);
List<LlmNerEntity> entitiesFromResponse = entitiesPerType.entrySet()
@ -236,7 +234,7 @@ public class LlmNerService {
}
private record EntitiesWithUsage(List<LlmNerEntity> entities, int completionTokens, int promptTokens) {
private record EntitiesWithUsage(List<LlmNerEntity> entities, int promptTokens, int completionTokens) {
}

View File

@ -36,7 +36,7 @@ dependencies {
implementation("com.knecon.fforesight:keycloak-commons:0.30.0") {
exclude(group = "com.knecon.fforesight", module = "tenant-commons")
}
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
implementation("com.knecon.fforesight:tenant-commons:0.31.0")
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
implementation("ch.qos.logback:logback-classic")

View File

@ -1,67 +0,0 @@
package com.knecon.fforesight.llm.service.queue;
import java.util.Map;
import java.util.Set;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.llm.service.QueueNames;
import com.knecon.fforesight.tenantcommons.TenantProvider;
import com.knecon.fforesight.tenantcommons.model.TenantCreatedEvent;
import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration;
import com.knecon.fforesight.tenantcommons.model.TenantResponse;
import com.knecon.fforesight.tenantcommons.queue.RabbitQueueFromExchangeService;
import com.knecon.fforesight.tenantcommons.queue.TenantExchangeMessageReceiver;
@Service
public class TenantExchangeMessageReceiverImpl extends TenantExchangeMessageReceiver {

    /** Wires the queue-creation service and tenant provider into the framework base class. */
    public TenantExchangeMessageReceiverImpl(RabbitQueueFromExchangeService rabbitQueueService, TenantProvider tenantProvider) {
        super(rabbitQueueService, tenantProvider);
    }

    /**
     * Declares the tenant-scoped queue this service consumes: the LLM-NER request queue,
     * bound to its exchange, with a dead-letter queue and a two-level message priority.
     */
    @Override
    protected Set<TenantQueueConfiguration> getTenantQueueConfigs() {
        return Set.of(TenantQueueConfiguration.builder()
                .listenerId(MessageHandler.LLM_NER_REQUEST_LISTENER_ID)
                .exchangeName(QueueNames.LLM_NER_REQUEST_EXCHANGE)
                .queuePrefix(QueueNames.LLM_NER_REQUEST_QUEUE_PREFIX)
                .dlqName(QueueNames.LLM_NER_DLQ)
                .arguments(Map.of("x-max-priority", 2))
                .build());
    }

    /**
     * Creates the per-tenant queues once the application context is fully started.
     * (A leftover System.out debug print was removed here.)
     */
    @EventListener(ApplicationReadyEvent.class)
    public void onApplicationReady() {
        super.initializeQueues();
    }

    /** Provisions queues for a newly created tenant. */
    @RabbitHandler
    @RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantCreatedQueueName()}")
    public void reactToTenantCreation(TenantCreatedEvent tenantCreatedEvent) {
        super.reactToTenantCreation(tenantCreatedEvent);
    }

    /** Tears down queues for a deleted tenant. */
    @RabbitHandler
    @RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantDeletedQueueName()}")
    public void reactToTenantDeletion(TenantResponse tenantResponse) {
        super.reactToTenantDeletion(tenantResponse);
    }
}

View File

@ -1,11 +0,0 @@
package com.knecon.fforesight.llm.service.queue;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.tenantcommons.queue.TenantMessagingConfiguration;
/**
 * Concrete Spring configuration for the tenant-commons messaging setup.
 * The class body is intentionally empty: registering this subclass as a
 * {@code @Configuration} bean is what activates the inherited
 * {@code TenantMessagingConfiguration} (its bean name is also referenced by
 * SpEL queue-name expressions elsewhere, e.g. {@code tenantMessagingConfigurationImpl}).
 */
@Configuration
public class TenantMessagingConfigurationImpl extends TenantMessagingConfiguration {
}

View File

@ -0,0 +1,28 @@
package com.knecon.fforesight.llm.service.queue;
import java.util.Map;
import java.util.Set;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import com.knecon.fforesight.llm.service.QueueNames;
import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration;
import com.knecon.fforesight.tenantcommons.model.TenantQueueProvider;
@Configuration
public class TenantQueueProviderConfig {

    /**
     * Exposes the tenant queue configuration consumed by tenant-commons: the LLM-NER
     * request queue, bound to its exchange, with a dead-letter queue and a two-level
     * message priority. (The inline fully-qualified TenantQueueConfiguration name was
     * replaced with a regular import for consistency with the file's other imports.)
     */
    @Bean
    protected TenantQueueProvider getTenantQueueConfigs() {
        return new TenantQueueProvider(Set.of(TenantQueueConfiguration.builder()
                .listenerId(MessageHandler.LLM_NER_REQUEST_LISTENER_ID)
                .exchangeName(QueueNames.LLM_NER_REQUEST_EXCHANGE)
                .queuePrefix(QueueNames.LLM_NER_REQUEST_QUEUE_PREFIX)
                .dlqName(QueueNames.LLM_NER_DLQ)
                .arguments(Map.of("x-max-priority", 2))
                .build()));
    }
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.llm.service.websocket;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.security.core.parameters.P;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.llm.service.services.WebSocketMessagingTemplate;

View File

@ -37,7 +37,7 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {
@SneakyThrows
public void testLlmNer() {
Path folder = Path.of("/Users/maverickstuder/Downloads/10-09-2024-16-03-47_files_list");
Path folder = Path.of("/home/kschuettler/Downloads/New Folder (5)/18299ec0-7659-496a-a44a-194bbffb1700/1fb7d49ae389469f60db516cf81a3510");
LlmNerMessage message = prepStorage(folder);
llmNerService.runNer(message);
Path tmpFile = Path.of("/private/tmp", "LLM_ENTITIES", "entities.json");

View File

@ -13,5 +13,5 @@ keyword-service.url: "http://mock.url"
llm-service:
azureOpenAiKey: "Your key here"
azureOpenAiKey: "${AZURE_OPENAI_KEY}" # NOTE(review): a real-looking API key was committed here — rotate that key immediately and inject the value via environment/secret store, never in version control
azureOpenAiEndpoint: "https://knecon-ca-demo.openai.azure.com/"

View File

@ -1 +0,0 @@
{"entities":[{"value":"Kalt R.","type":"PII","startOffset":1951,"endOffset":1958},{"value":"Kalt R.","type":"PII","startOffset":3338,"endOffset":3345},{"value":"Kalt R.","type":"PII","startOffset":3476,"endOffset":3483},{"value":"Kalt R.","type":"PII","startOffset":3821,"endOffset":3828},{"value":"Jackson W.A.","type":"PII","startOffset":2286,"endOffset":2298},{"value":"Jackson W.A.","type":"PII","startOffset":2790,"endOffset":2802},{"value":"Jackson W.A.","type":"PII","startOffset":2911,"endOffset":2923},{"value":"Jackson W.A.","type":"PII","startOffset":3096,"endOffset":3108},{"value":"Kalt R.","type":"PII","startOffset":5055,"endOffset":5062},{"value":"Kalt R.","type":"PII","startOffset":5233,"endOffset":5240},{"value":"Kalt R.","type":"PII","startOffset":5895,"endOffset":5902},{"value":"Kalt R.","type":"PII","startOffset":5909,"endOffset":5916},{"value":"Kalt R.","type":"PII","startOffset":5931,"endOffset":5938},{"value":"Kalt R.","type":"PII","startOffset":5960,"endOffset":5967},{"value":"Kalt R.","type":"PII","startOffset":5989,"endOffset":5996},{"value":"Kalt R.","type":"PII","startOffset":6018,"endOffset":6025},{"value":"Kalt R.","type":"PII","startOffset":7253,"endOffset":7260},{"value":"Kalt R.","type":"PII","startOffset":7281,"endOffset":7288},{"value":"Kalt R.","type":"PII","startOffset":7309,"endOffset":7316},{"value":"Kalt R.","type":"PII","startOffset":7337,"endOffset":7344},{"value":"Kalt R. 
2009c","type":"PII","startOffset":10056,"endOffset":10069},{"value":"Kalt R.","type":"PII","startOffset":10767,"endOffset":10774},{"value":"Kalt R.","type":"PII","startOffset":10780,"endOffset":10787},{"value":"Kalt R.","type":"PII","startOffset":10802,"endOffset":10809},{"value":"Kalt R.","type":"PII","startOffset":10830,"endOffset":10837},{"value":"Kalt R.","type":"PII","startOffset":10858,"endOffset":10865},{"value":"Kalt R.","type":"PII","startOffset":10886,"endOffset":10893},{"value":"Kalt R.","type":"PII","startOffset":11980,"endOffset":11987},{"value":"Kalt R.","type":"PII","startOffset":12008,"endOffset":12015},{"value":"Kalt R.","type":"PII","startOffset":12036,"endOffset":12043},{"value":"Kalt R.","type":"PII","startOffset":12064,"endOffset":12071},{"value":"Kalt R.","type":"PII","startOffset":13814,"endOffset":13821},{"value":"Kalt R.","type":"PII","startOffset":14598,"endOffset":14605},{"value":"Kalt R.","type":"PII","startOffset":14855,"endOffset":14862},{"value":"Kalt R.","type":"PII","startOffset":15149,"endOffset":15156},{"value":"Kalt R.","type":"PII","startOffset":15481,"endOffset":15488},{"value":"Kalt R. 
2009c","type":"PII","startOffset":16392,"endOffset":16405},{"value":"Kalt R.","type":"PII","startOffset":17850,"endOffset":17857},{"value":"Kalt R.","type":"PII","startOffset":18284,"endOffset":18291},{"value":"Kalt R.","type":"PII","startOffset":18932,"endOffset":18939},{"value":"Kalt R.","type":"PII","startOffset":19412,"endOffset":19419},{"value":"Kalt R.","type":"PII","startOffset":19660,"endOffset":19667},{"value":"Kalt R.","type":"PII","startOffset":19973,"endOffset":19980},{"value":"Kalt R.","type":"PII","startOffset":20246,"endOffset":20253},{"value":"Kalt R.","type":"PII","startOffset":20522,"endOffset":20529},{"value":"Jackson W.","type":"PII","startOffset":19197,"endOffset":19207},{"value":"Briswalter C.","type":"PII","startOffset":20778,"endOffset":20791},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19003,"endOffset":19052},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19529,"endOffset":19578},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19776,"endOffset":19825},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20046,"endOffset":20095},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20362,"endOffset":20411},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20638,"endOffset":20687},{"value":"Syngenta Technology & Projects, Huddersfield, United Kingdom","type":"ADDRESS","startOffset":19265,"endOffset":19325},{"value":"Syngenta Crop Protection AG, Basel, Switzerland","type":"ADDRESS","startOffset":20809,"endOffset":20856}]}