Merge branch 'RED-9123' into 'main'

RED-9123: Protobuf serialization of document data files

See merge request fforesight/llm-service!24
This commit is contained in:
Maverick Studer 2024-10-07 13:37:34 +02:00
commit 2d66b1e5d4
13 changed files with 485 additions and 118 deletions

View File

@ -13,10 +13,13 @@ extra["testcontainersVersion"] = "1.20.0"
dependencies {
implementation(project(":llm-service-api"))
implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.159.0")
implementation("com.iqser.red.commons:storage-commons:2.49.0")
implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.181.0")
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.springframework.boot:spring-boot-starter:3.1.1")
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
exclude(group = "com.iqser.red.commons", module = "storage-commons")
}
implementation("com.azure:azure-ai-openai:1.0.0-beta.10")
implementation("ch.qos.logback:logback-classic:1.5.7")
implementation("com.google.protobuf:protobuf-java:4.27.1")
}

View File

@ -2,10 +2,11 @@ package com.knecon.fforesight.llm.service.document;
import java.io.Serializable;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -21,9 +22,15 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentData implements Serializable {
DocumentPage[] documentPages;
DocumentTextData[] documentTextData;
DocumentPositionData[] documentPositionData;
DocumentStructure documentStructure;
DocumentPageProto.AllDocumentPages documentPages;
DocumentTextDataProto.AllDocumentTextData documentTextData;
DocumentPositionDataProto.AllDocumentPositionData documentPositionData;
DocumentStructureWrapper documentStructureWrapper;
/**
 * Convenience accessor that unwraps the protobuf document structure from {@code documentStructureWrapper}.
 *
 * @return the value returned by the wrapper's {@code getDocumentStructure()}
 */
public DocumentStructureProto.DocumentStructure getDocumentStructure() {

    DocumentStructureWrapper wrapper = this.documentStructureWrapper;
    return wrapper.getDocumentStructure();
}
}

View File

@ -1,9 +1,11 @@
package com.knecon.fforesight.llm.service.document;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
@ -25,10 +27,6 @@ import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import lombok.experimental.UtilityClass;
@ -41,28 +39,30 @@ public class DocumentGraphMapper {
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
context.pageData.addAll(Arrays.stream(documentData.getDocumentPages())
context.pageData.addAll(documentData.getDocumentPages().getDocumentPagesList()
.stream()
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
document.setDocumentTree(context.documentTree);
document.setPages(new HashSet<>(context.pageData));
document.setNumberOfPages(documentData.getDocumentPages().length);
document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount());
document.setTextBlock(document.getTextBlock());
return document;
}
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
private List<DocumentTree.Entry> buildEntries(List<EntryData> entries, Context context) {
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
for (DocumentStructure.EntryData entryData : entries) {
for (EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
.map(pageNumber -> getPage(pageNumber, context))
List<Page> pages = entryData.getPageNumbersList()
.stream()
.map(context::getPage)
.toList();
SemanticNode node = switch (entryData.getType()) {
@ -74,33 +74,30 @@ public class DocumentGraphMapper {
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
if (entryData.getAtomicBlockIds().length > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
if (entryData.getAtomicBlockIdsCount() > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
node.setLeafTextBlock(textBlock);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node));
default -> textBlock.getAtomicTextBlocks()
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
}
}
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
.toList();
if (entryData.getEngines() != null) {
entryData.getEngines()
.forEach(node::addEngine);
} else {
entryData.setEngines(Collections.emptySet());
}
List<Integer> treeId = entryData.getTreeIdList();
entryData.getEnginesList()
.forEach(node::addEngine);
node.setTreeId(treeId);
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
}
return newEntries;
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
} return newEntries;
}
@ -110,10 +107,10 @@ public class DocumentGraphMapper {
}
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
private Image buildImage(Context context, Map<String, String> properties, List<Long> pageNumbers) {
assert pageNumbers.length == 1;
Page page = getPage(pageNumbers[0], context);
assert pageNumbers.size() == 1;
Page page = context.getPage(pageNumbers.get(0));
var builder = Image.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.documentTree(context.documentTree).page(page).build();
@ -159,13 +156,14 @@ public class DocumentGraphMapper {
return SuperSection.builder().documentTree(context.documentTree).build();
}
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
@ -174,9 +172,9 @@ public class DocumentGraphMapper {
}
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
private TextBlock toTextBlock(List<Long> atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds)
return atomicTextBlockIds.stream()
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
@ -184,24 +182,16 @@ public class DocumentGraphMapper {
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)),
context.documentPositionData.getDocumentPositionData(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
context.getPage(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)).getPage()));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
}
private Page getPage(Long pageIndex, Context context) {
Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
assert page.getNumber() == Math.toIntExact(pageIndex);
return page;
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
}
@ -209,21 +199,27 @@ public class DocumentGraphMapper {
private final DocumentTree documentTree;
private final List<Page> pageData;
private final List<DocumentTextData> documentTextData;
private final List<DocumentPositionData> documentPositionData;
private final AllDocumentTextData documentTextData;
private final AllDocumentPositionData documentPositionData;
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pageData = new ArrayList<>();
this.documentTextData = Arrays.stream(documentData.getDocumentTextData())
.toList();
this.documentPositionData = Arrays.stream(documentData.getDocumentPositionData())
.toList();
this.documentTextData = documentData.getDocumentTextData();
this.documentPositionData = documentData.getDocumentPositionData();
}
/**
 * Looks up a page in {@code pageData} by its page number.
 * The page is read from list position {@code pageIndex - 1}.
 *
 * @param pageIndex the page number; must fit into an int, otherwise {@link Math#toIntExact(long)} throws
 * @return the page stored at position {@code pageIndex - 1}
 */
private Page getPage(Long pageIndex) {

    int pageNumber = Math.toIntExact(pageIndex);
    Page candidate = pageData.get(pageNumber - 1);
    // Sanity check (only active with -ea): list position and stored page number must agree.
    assert candidate.getNumber() == pageNumber;
    return candidate;
}
}
}

View File

@ -9,7 +9,7 @@ import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.ImageType;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import lombok.experimental.UtilityClass;
@ -18,32 +18,32 @@ public class PropertiesMapper {
public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
builder.imageType(ImageType.fromString(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE)));
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT)));
builder.position(parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION)));
builder.id(properties.get(DocumentStructure.ImageProperties.ID));
builder.imageType(ImageType.fromString(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
builder.position(parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
builder.id(properties.get(DocumentStructureWrapper.ImageProperties.ID));
}
public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW)));
builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL)));
builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER)));
builder.bBox(parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX)));
builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
builder.bBox(parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
}
public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
}
private Rectangle2D parseRectangle2D(String bBox) {
List<Float> floats = Arrays.stream(bBox.split(DocumentStructure.RECTANGLE_DELIMITER))
List<Float> floats = Arrays.stream(bBox.split(DocumentStructureWrapper.RECTANGLE_DELIMITER))
.map(Float::parseFloat)
.toList();
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
@ -52,21 +52,21 @@ public class PropertiesMapper {
public static boolean isDuplicateParagraph(Map<String, String> properties) {
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
}
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
}
public static Long[] toLongArray(String ids) {
public static List<Long> toLongList(String ids) {
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
.map(Long::valueOf)
.toArray(Long[]::new);
.toList();
}
}

View File

@ -9,7 +9,7 @@ import java.util.Set;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -31,7 +31,7 @@ import lombok.extern.slf4j.Slf4j;
public abstract class AbstractSemanticNode implements GenericSemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
Set<LayoutEngineProto.LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM));
@EqualsAndHashCode.Include
List<Integer> treeId;

View File

@ -5,6 +5,7 @@ import java.util.List;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
@ -35,7 +36,8 @@ public class Page {
Integer width;
Integer rotation;
List<SemanticNode> mainBody;
List<AtomicTextBlock> textBlocksOnPage;
Header header;
Footer footer;
@ -53,13 +55,36 @@ public class Page {
*/
public TextBlock getMainBodyTextBlock() {
return mainBody.stream()
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock)
return textBlocksOnPage.stream()
.filter(atb -> !atb.isEmpty())
.collect(new TextBlockCollector());
}
/**
 * Retrieves the highest SemanticNodes which appear only on this page. For every AtomicTextBlock on this page, the
 * DocumentTree is traversed upwards from the block's parent until a SemanticNode's direct parent is no longer
 * exclusively on this page.
 *
 * @return A list of the distinct highest SemanticNodes which appear only on this page.
 */
public List<SemanticNode> getMainBody() {

    return textBlocksOnPage.stream()
            .map(atomicTextBlock -> getHighestParentOnlyOnPage(atomicTextBlock.getParent()))
            .distinct()
            .toList();
}
/**
 * Climbs from the given node towards the root for as long as the next parent lies exclusively on this page.
 *
 * @param node the node to start from
 * @return the last node reached before a parent is missing or is not exclusively on this page
 */
private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {

    SemanticNode highest = node;
    while (true) {
        boolean canClimb = highest.hasParent() && highest.getParent().onlyOnPage(this);
        if (!canClimb) {
            return highest;
        }
        highest = highest.getParent();
    }
}
@Override
public String toString() {

View File

@ -21,7 +21,7 @@ import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
public interface SemanticNode {
@ -287,11 +287,10 @@ public interface SemanticNode {
return getTextBlock().getSearchText().contains(string);
}
Set<LayoutEngine> getEngines();
Set<LayoutEngineProto.LayoutEngine> getEngines();
default void addEngine(LayoutEngine engine) {
default void addEngine(LayoutEngineProto.LayoutEngine engine) {
getEngines().add(engine);
}
@ -669,4 +668,17 @@ public interface SemanticNode {
return bBoxPerPage;
}
/**
 * Checks whether this SemanticNode appears on exactly one page, and whether that page is the provided one.
 *
 * @param page the page to check
 * @return true when this SemanticNode is on a single page only and that page is the provided page; otherwise false.
 */
default boolean onlyOnPage(Page page) {

    Set<Page> occupiedPages = getPages();
    if (occupiedPages.size() != 1) {
        return false;
    }
    return occupiedPages.contains(page);
}
}

View File

@ -16,7 +16,7 @@ import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -36,7 +36,7 @@ import lombok.experimental.FieldDefaults;
public class Table implements SemanticNode {
@Builder.Default
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
Set<LayoutEngineProto.LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM));
@EqualsAndHashCode.Include
List<Integer> treeId;
DocumentTree documentTree;

View File

@ -5,7 +5,6 @@ import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@ -19,8 +18,8 @@ import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -78,7 +77,10 @@ public class AtomicTextBlock implements TextBlock {
}
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData atomicTextBlockData, DocumentPositionData atomicPositionBlockData, SemanticNode parent, Page page) {
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextDataProto.DocumentTextData atomicTextBlockData,
DocumentPositionDataProto.DocumentPositionData atomicPositionBlockData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder()
.id(atomicTextBlockData.getId())
@ -86,20 +88,18 @@ public class AtomicTextBlock implements TextBlock {
.page(page)
.textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
.searchText(atomicTextBlockData.getSearchText())
.lineBreaks(Arrays.stream(atomicTextBlockData.getLineBreaks()).boxed()
.toList())
.stringIdxToPositionIdx(Arrays.stream(atomicPositionBlockData.getStringIdxToPositionIdx()).boxed()
.toList())
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
.lineBreaks(atomicTextBlockData.getLineBreaksList())
.stringIdxToPositionIdx(atomicPositionBlockData.getStringIdxToPositionIdxList())
.positions(toRectangle2DList(atomicPositionBlockData.getPositionsList()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
private static List<Rectangle2D> toRectangle2DList(List<DocumentPositionDataProto.DocumentPositionData.Position> positions) {
return Arrays.stream(positions)
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
return positions.stream()
.map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3)))
.toList();
}

View File

@ -1,7 +1,14 @@
package com.knecon.fforesight.llm.service.services;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@ -24,11 +31,14 @@ import com.azure.ai.openai.models.CompletionsUsage;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.primitives.Floats;
import com.iqser.red.storage.commons.exception.StorageException;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.llm.service.ChunkingResponse;
import com.knecon.fforesight.llm.service.LlmNerEntities;
import com.knecon.fforesight.llm.service.LlmNerEntity;
import com.knecon.fforesight.llm.service.LlmNerMessage;
import com.knecon.fforesight.llm.service.LlmServiceSettings;
import com.knecon.fforesight.llm.service.SystemMessages;
import com.knecon.fforesight.llm.service.document.DocumentData;
import com.knecon.fforesight.llm.service.document.DocumentGraphMapper;
@ -36,10 +46,18 @@ import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.models.Chunk;
import com.knecon.fforesight.llm.service.utils.FormattingUtils;
import com.knecon.fforesight.llm.service.utils.StorageIdUtils;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.AccessLevel;
@ -217,16 +235,227 @@ public class LlmNerService {
private Document buildDocument(LlmNerMessage llmNerMessage) {
DocumentData documentData = new DocumentData();
documentData.setDocumentStructure(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentStructureStorageId(), DocumentStructure.class));
documentData.setDocumentTextData(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentTextStorageId(), DocumentTextData[].class));
documentData.setDocumentPositionData(storageService.readJSONObject(TenantContext.getTenantId(),
llmNerMessage.getDocumentPositionStorageId(),
DocumentPositionData[].class));
documentData.setDocumentPages(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentPagesStorageId(), DocumentPage[].class));
documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(fetchDocumentStructure(llmNerMessage.getDocumentStructureStorageId())));
documentData.setDocumentTextData(fetchDocumentTextData(llmNerMessage.getDocumentTextStorageId()));
documentData.setDocumentPositionData(fetchDocumentPositionData(llmNerMessage.getDocumentPositionStorageId()));
documentData.setDocumentPages(fetchAllDocumentPages(llmNerMessage.getDocumentPagesStorageId()));
return DocumentGraphMapper.toDocumentGraph(documentData);
}
/**
 * Loads the document structure referenced by {@code storageId}.
 * <p>
 * If the file type extension parsed from the storage id contains {@code "proto"}, the object is read as protobuf.
 * Otherwise the legacy JSON object is fetched through {@code getOldData} and converted to the protobuf model.
 *
 * @param storageId storage id of the document structure file
 * @return the protobuf document structure, or {@code null} when {@code getOldData} yields {@code null}
 */
private DocumentStructureProto.DocumentStructure fetchDocumentStructure(String storageId) {

    StorageIdUtils.StorageInfo info = StorageIdUtils.parseStorageId(storageId);
    boolean storedAsProto = info.fileTypeExtension().contains("proto");
    if (storedAsProto) {
        return storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentStructureProto.DocumentStructure.parser());
    }

    DocumentStructure legacyStructure = getOldData(info.dossierId(), info.fileId(), info.fileTypeName(), DocumentStructure.class);
    return legacyStructure == null ? null : convertDocumentStructure(legacyStructure);
}
/**
 * Loads the document text data referenced by {@code storageId}.
 * <p>
 * If the file type extension parsed from the storage id contains {@code "proto"}, the object is read as protobuf.
 * Otherwise the legacy JSON array is fetched through {@code getOldData} and converted to the protobuf model.
 *
 * @param storageId storage id of the document text file
 * @return the protobuf text data, or {@code null} when {@code getOldData} yields {@code null}
 */
private DocumentTextDataProto.AllDocumentTextData fetchDocumentTextData(String storageId) {

    StorageIdUtils.StorageInfo info = StorageIdUtils.parseStorageId(storageId);
    boolean storedAsProto = info.fileTypeExtension().contains("proto");
    if (storedAsProto) {
        return storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentTextDataProto.AllDocumentTextData.parser());
    }

    DocumentTextData[] legacyTextData = getOldData(info.dossierId(), info.fileId(), info.fileTypeName(), DocumentTextData[].class);
    return legacyTextData == null ? null : convertAllDocumentTextData(legacyTextData);
}
/**
 * Loads the document position data referenced by {@code storageId}.
 * <p>
 * If the file type extension parsed from the storage id contains {@code "proto"}, the object is read as protobuf.
 * Otherwise the legacy JSON array is fetched through {@code getOldData} and converted to the protobuf model.
 *
 * @param storageId storage id of the document position file
 * @return the protobuf position data, or {@code null} when {@code getOldData} yields {@code null}
 */
private DocumentPositionDataProto.AllDocumentPositionData fetchDocumentPositionData(String storageId) {

    StorageIdUtils.StorageInfo info = StorageIdUtils.parseStorageId(storageId);
    boolean storedAsProto = info.fileTypeExtension().contains("proto");
    if (storedAsProto) {
        return storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPositionDataProto.AllDocumentPositionData.parser());
    }

    DocumentPositionData[] legacyPositionData = getOldData(info.dossierId(), info.fileId(), info.fileTypeName(), DocumentPositionData[].class);
    return legacyPositionData == null ? null : convertAllDocumentPositionData(legacyPositionData);
}
/**
 * Loads the document pages referenced by {@code storageId}.
 * <p>
 * If the file type extension parsed from the storage id contains {@code "proto"}, the object is read as protobuf.
 * Otherwise the legacy JSON array is fetched through {@code getOldData} and converted to the protobuf model.
 *
 * @param storageId storage id of the document pages file
 * @return the protobuf page collection, or {@code null} when {@code getOldData} yields {@code null}
 */
private DocumentPageProto.AllDocumentPages fetchAllDocumentPages(String storageId) {

    StorageIdUtils.StorageInfo info = StorageIdUtils.parseStorageId(storageId);
    boolean storedAsProto = info.fileTypeExtension().contains("proto");
    if (storedAsProto) {
        return storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPageProto.AllDocumentPages.parser());
    }

    DocumentPage[] legacyPages = getOldData(info.dossierId(), info.fileId(), info.fileTypeName(), DocumentPage[].class);
    return legacyPages == null ? null : convertAllDocumentPages(legacyPages);
}
/**
 * Reads the legacy JSON representation of a document data file and deserializes it with {@code mapper}.
 * <p>
 * The storage id is rebuilt from the given parts with a {@code ".json"} extension.
 *
 * @param dossierId dossier part of the storage id
 * @param fileId    file part of the storage id
 * @param fileType  file type name used for the storage id and for log output
 * @param valueType target class for deserialization
 * @param <T>       type of the deserialized object
 * @return the deserialized object, or {@code null} if reading or parsing failed with an {@link IOException}
 */
private <T> T getOldData(String dossierId, String fileId, String fileType, Class<T> valueType) {

    String oldStorageId = StorageIdUtils.getStorageId(dossierId, fileId, fileType, ".json");
    // Use the logger rather than stdout so the lookup is level-controlled and ends up in the service logs.
    log.debug("Looking for legacy JSON object {}", oldStorageId);
    try (InputStream inputStream = getObject(TenantContext.getTenantId(), oldStorageId)) {
        return mapper.readValue(inputStream, valueType);
    } catch (IOException e) {
        // The exception is passed as the last argument so the logger records the full stack trace.
        log.error("Could not read JSON for {} (storage id {})", fileType, oldStorageId, e);
        return null;
    }
}
/**
 * Recursively converts a legacy JSON entry into its protobuf counterpart.
 * <p>
 * Copies type, tree id, atomic block ids, page numbers, properties, engines and children. Enum values are
 * mapped by name, so an enum constant without a same-named protobuf constant makes {@code valueOf} throw
 * {@link IllegalArgumentException}.
 *
 * @param oldEntryData the legacy entry to convert
 * @return the equivalent protobuf entry, including all converted children
 */
private static EntryDataProto.EntryData convertEntryData(DocumentStructure.EntryData oldEntryData) {

    EntryDataProto.EntryData.Builder builder = EntryDataProto.EntryData.newBuilder();
    builder.setType(NodeTypeProto.NodeType.valueOf(oldEntryData.getType().name()));
    builder.addAllTreeId(Arrays.stream(oldEntryData.getTreeId()).boxed()
            .collect(Collectors.toList()));
    builder.addAllAtomicBlockIds(Arrays.asList(oldEntryData.getAtomicBlockIds()));
    builder.addAllPageNumbers(Arrays.asList(oldEntryData.getPageNumbers()));
    builder.putAllProperties(oldEntryData.getProperties());

    // Carry the layout engines over as well; without this, entries loaded from legacy JSON
    // would lose their engine information. Legacy entries may have no engines set at all.
    if (oldEntryData.getEngines() != null) {
        oldEntryData.getEngines()
                .forEach(engine -> builder.addEngines(
                        com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto.LayoutEngine.valueOf(engine.name())));
    }

    if (oldEntryData.getChildren() != null) {
        oldEntryData.getChildren()
                .forEach(child -> builder.addChildren(convertEntryData(child)));
    }

    return builder.build();
}
/**
 * Converts a legacy document structure into the protobuf representation.
 *
 * @param oldStructure legacy structure to convert
 * @return a protobuf structure whose root is set only when the legacy root is non-null
 */
private static DocumentStructureProto.DocumentStructure convertDocumentStructure(DocumentStructure oldStructure) {
    DocumentStructure.EntryData legacyRoot = oldStructure.getRoot();
    if (legacyRoot == null) {
        // No root available: return a structure with the root left unset.
        return DocumentStructureProto.DocumentStructure.newBuilder().build();
    }
    return DocumentStructureProto.DocumentStructure.newBuilder()
            .setRoot(convertEntryData(legacyRoot))
            .build();
}
/**
 * Converts a single legacy page into the protobuf representation,
 * copying number, height, width and rotation.
 *
 * @param oldPage legacy page to convert
 * @return the equivalent protobuf page
 */
private static DocumentPageProto.DocumentPage convertDocumentPage(DocumentPage oldPage) {
    DocumentPageProto.DocumentPage.Builder page = DocumentPageProto.DocumentPage.newBuilder();
    page.setNumber(oldPage.getNumber());
    page.setHeight(oldPage.getHeight());
    page.setWidth(oldPage.getWidth());
    page.setRotation(oldPage.getRotation());
    return page.build();
}
/**
 * Converts an array of legacy pages into a single protobuf container message,
 * preserving the array order.
 *
 * @param oldPages legacy pages to convert
 * @return container holding one converted page per input element
 */
private static DocumentPageProto.AllDocumentPages convertAllDocumentPages(DocumentPage[] oldPages) {
    DocumentPageProto.AllDocumentPages.Builder pages = DocumentPageProto.AllDocumentPages.newBuilder();
    Arrays.stream(oldPages)
            .map(legacyPage -> convertDocumentPage(legacyPage))
            .forEach(converted -> pages.addDocumentPages(converted));
    return pages.build();
}
/**
 * Converts one legacy position data object into the protobuf representation.
 * <p>
 * The id and the string-index-to-position-index mapping are copied, and every
 * {@code float[]} in the legacy positions becomes one {@code Position} message.
 *
 * @param oldData legacy position data to convert
 * @return the equivalent protobuf position data
 */
private static DocumentPositionDataProto.DocumentPositionData convertDocumentPositionData(DocumentPositionData oldData) {
    DocumentPositionDataProto.DocumentPositionData.Builder result = DocumentPositionDataProto.DocumentPositionData.newBuilder();
    result.setId(oldData.getId());
    result.addAllStringIdxToPositionIdx(Arrays.stream(oldData.getStringIdxToPositionIdx()).boxed().collect(Collectors.toList()));

    for (float[] coordinates : oldData.getPositions()) {
        result.addPositions(DocumentPositionDataProto.DocumentPositionData.Position.newBuilder()
                .addAllValue(Floats.asList(coordinates))
                .build());
    }
    return result.build();
}
/**
 * Converts an array of legacy position data objects into a single protobuf
 * container message, preserving the array order.
 *
 * @param oldDataList legacy position data objects to convert
 * @return container holding one converted element per input element
 */
private static DocumentPositionDataProto.AllDocumentPositionData convertAllDocumentPositionData(DocumentPositionData[] oldDataList) {
    DocumentPositionDataProto.AllDocumentPositionData.Builder container = DocumentPositionDataProto.AllDocumentPositionData.newBuilder();
    Arrays.stream(oldDataList)
            .map(legacy -> convertDocumentPositionData(legacy))
            .forEach(converted -> container.addDocumentPositionData(converted));
    return container.build();
}
/**
 * Converts one legacy text data object into the protobuf representation,
 * copying id, page, search text, number on page, start, end and line breaks.
 *
 * @param oldData legacy text data to convert
 * @return the equivalent protobuf text data
 */
private static DocumentTextDataProto.DocumentTextData convertDocumentTextData(DocumentTextData oldData) {
    DocumentTextDataProto.DocumentTextData.Builder text = DocumentTextDataProto.DocumentTextData.newBuilder();
    text.setId(oldData.getId());
    text.setPage(oldData.getPage());
    text.setSearchText(oldData.getSearchText());
    text.setNumberOnPage(oldData.getNumberOnPage());
    text.setStart(oldData.getStart());
    text.setEnd(oldData.getEnd());
    text.addAllLineBreaks(Arrays.stream(oldData.getLineBreaks()).boxed().collect(Collectors.toList()));
    return text.build();
}
/**
 * Converts an array of legacy text data objects into a single protobuf
 * container message, preserving the array order.
 *
 * @param oldDataList legacy text data objects to convert
 * @return container holding one converted element per input element
 */
private static DocumentTextDataProto.AllDocumentTextData convertAllDocumentTextData(DocumentTextData[] oldDataList) {
    DocumentTextDataProto.AllDocumentTextData.Builder container = DocumentTextDataProto.AllDocumentTextData.newBuilder();
    Arrays.stream(oldDataList)
            .map(legacy -> convertDocumentTextData(legacy))
            .forEach(converted -> container.addDocumentTextData(converted));
    return container.build();
}
/**
 * Downloads a storage object into a temporary file and returns a buffered stream over it.
 * <p>
 * The stream is opened with {@link StandardOpenOption#DELETE_ON_CLOSE}, so the temporary file
 * is removed when the caller closes the stream. If the download or the opening of the stream
 * fails, the temporary file is deleted here before the failure is propagated, because no
 * stream exists yet that could clean it up.
 *
 * @param tenantId  tenant the object belongs to
 * @param storageId id of the object to download
 * @return buffered input stream over the downloaded content; the caller must close it
 */
@SneakyThrows
private InputStream getObject(String tenantId, String storageId) {
    File tempFile = File.createTempFile("temp", ".data");
    try {
        storageService.downloadTo(tenantId, storageId, tempFile);
        return new BufferedInputStream(Files.newInputStream(tempFile.toPath(), StandardOpenOption.DELETE_ON_CLOSE));
    } catch (Exception e) {
        // Do not leave the temp file behind when no stream was handed out.
        try {
            Files.deleteIfExists(tempFile.toPath());
        } catch (IOException cleanupFailure) {
            // Keep the original failure as the primary exception.
            e.addSuppressed(cleanupFailure);
        }
        throw e;
    }
}
/**
 * Immutable pair of a list of {@code LlmNerEntity} and the {@code CompletionsUsage} that belongs to it.
 *
 * @param entities         the entities
 * @param completionsUsage the completions usage associated with the entities
 */
private record EntitiesWithUsage(List<LlmNerEntity> entities, CompletionsUsage completionsUsage) {
}

View File

@ -0,0 +1,48 @@
package com.knecon.fforesight.llm.service.utils;
import java.util.Arrays;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for building and parsing storage ids of the form
 * {@code dossierId/fileId.fileTypeName.extension}.
 */
public final class StorageIdUtils {

    public static final String INVALID_STORAGE_ID_FORMAT = "Invalid storageId format";

    private StorageIdUtils() {
        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Builds a storage id as {@code dossierId + "/" + fileId + "." + fileName + fileExtension}.
     *
     * @param dossierId     dossier part of the id
     * @param fileId        file part of the id
     * @param fileName      file type name
     * @param fileExtension extension appended as-is; it must carry its own leading dot (e.g. {@code ".json"})
     * @return the assembled storage id
     */
    public static String getStorageId(String dossierId, String fileId, String fileName, String fileExtension) {

        return dossierId + "/" + fileId + "." + fileName + fileExtension;
    }


    /**
     * Splits a storage id into its components.
     * <p>
     * Everything before the first {@code '/'} is the dossier id. The remainder is split on
     * {@code '.'}: the first segment is the file id, the last segment is the extension and
     * the segments in between (re-joined with {@code '.'}) form the file type name.
     *
     * @param storageId storage id to parse
     * @return the parsed components
     * @throws IllegalArgumentException with message {@link #INVALID_STORAGE_ID_FORMAT} if the id is
     *                                  {@code null}, has no {@code '/'}, has fewer than three dot-separated
     *                                  segments after the {@code '/'}, or any component is empty
     */
    public static StorageInfo parseStorageId(String storageId) {

        if (storageId == null) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        String[] parts = storageId.split("/", 2);
        if (parts.length < 2) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }
        String dossierId = parts[0];

        // Limit -1 keeps trailing empty segments, so "a/b.TYPE." is rejected below
        // instead of being silently shortened to two segments.
        String[] fileParts = parts[1].split("\\.", -1);
        if (fileParts.length < 3) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        int last = fileParts.length - 1;
        String fileId = fileParts[0];
        String fileTypeExtension = fileParts[last];
        String fileTypeName = String.join(".", Arrays.copyOfRange(fileParts, 1, last));

        if (dossierId.isEmpty() || fileId.isEmpty() || fileTypeName.isEmpty() || fileTypeExtension.isEmpty()) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        return new StorageInfo(dossierId, fileId, fileTypeName, fileTypeExtension);
    }


    /**
     * Components of a parsed storage id.
     *
     * @param dossierId         part before the first {@code '/'}
     * @param fileId            first dot-separated segment after the {@code '/'}
     * @param fileTypeName      segments between file id and extension, joined with {@code '.'}
     * @param fileTypeExtension last dot-separated segment, without a leading dot
     */
    public record StorageInfo(String dossierId, String fileId, String fileTypeName, String fileTypeExtension) {

    }

}

View File

@ -32,8 +32,10 @@ dependencies {
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.3")
implementation("org.springframework.boot:spring-boot-starter-websocket:$springBootVersion")
implementation("org.springframework.security:spring-security-messaging:$springSecurityVersion")
implementation("com.iqser.red.commons:storage-commons:2.49.0")
implementation("com.knecon.fforesight:keycloak-commons:0.30.0")
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("com.knecon.fforesight:keycloak-commons:0.30.0") {
exclude(group = "com.knecon.fforesight", module = "tenant-commons")
}
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
implementation("ch.qos.logback:logback-classic")

View File

@ -0,0 +1,45 @@
package com.knecon.fforesight.llm.service;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.llm.service.utils.StorageIdUtils;
/**
 * Unit tests for {@code StorageIdUtils.parseStorageId}: one well-formed id and
 * three malformed ids that must be rejected with an {@link IllegalArgumentException}.
 */
public class StorageIdUtilsTest {

    @Test
    void testParseStorageId_ValidInput() {

        String storageId = "dossierId/fileId.DOCUMENT_STRUCTURE.json";

        StorageIdUtils.StorageInfo parsed = StorageIdUtils.parseStorageId(storageId);

        assertEquals("dossierId", parsed.dossierId(), "Incorrect dossierId");
        assertEquals("fileId", parsed.fileId(), "Incorrect fileId");
        assertEquals("DOCUMENT_STRUCTURE", parsed.fileTypeName(), "Incorrect fileTypeName");
        assertEquals("json", parsed.fileTypeExtension(), "Incorrect fileTypeExtension");
    }


    @Test
    void testParseStorageId_MissingFileTypeExtension() {

        assertRejected("dossierId/fileId.DOCUMENT_STRUCTURE");
    }


    @Test
    void testParseStorageId_InvalidFormat() {

        assertRejected("invalidFormat");
    }


    @Test
    void testParseStorageId_NoDotsInFilePart() {

        assertRejected("dossierId/fileId");
    }


    /** Asserts that parsing the given id fails with the expected exception type and message. */
    private static void assertRejected(String storageId) {

        IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class, () -> StorageIdUtils.parseStorageId(storageId));
        assertEquals("Invalid storageId format", thrown.getMessage());
    }

}