From d40f4f3289e80bf47e7f60440aabfbedd5b87943 Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Mon, 7 Oct 2024 13:10:27 +0200 Subject: [PATCH] RED-9123: Protobuf serialization of document data files --- .../llm-service-processor/build.gradle.kts | 9 +- .../llm/service/document/DocumentData.java | 23 +- .../service/document/DocumentGraphMapper.java | 110 ++++---- .../service/document/PropertiesMapper.java | 34 +-- .../document/nodes/AbstractSemanticNode.java | 4 +- .../llm/service/document/nodes/Page.java | 33 ++- .../service/document/nodes/SemanticNode.java | 20 +- .../llm/service/document/nodes/Table.java | 4 +- .../document/textblock/AtomicTextBlock.java | 24 +- .../llm/service/services/LlmNerService.java | 243 +++++++++++++++++- .../llm/service/utils/StorageIdUtils.java | 48 ++++ .../llm-service-server/build.gradle.kts | 6 +- .../llm/service/StorageIdUtilsTest.java | 45 ++++ 13 files changed, 485 insertions(+), 118 deletions(-) create mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/utils/StorageIdUtils.java create mode 100644 llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/StorageIdUtilsTest.java diff --git a/llm-service/llm-service-processor/build.gradle.kts b/llm-service/llm-service-processor/build.gradle.kts index 21d7771..39cc756 100644 --- a/llm-service/llm-service-processor/build.gradle.kts +++ b/llm-service/llm-service-processor/build.gradle.kts @@ -13,10 +13,13 @@ extra["testcontainersVersion"] = "1.20.0" dependencies { implementation(project(":llm-service-api")) - implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.159.0") - implementation("com.iqser.red.commons:storage-commons:2.49.0") + implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.181.0") + implementation("com.iqser.red.commons:storage-commons:2.50.0") implementation("org.springframework.boot:spring-boot-starter:3.1.1") - 
implementation("com.knecon.fforesight:tenant-commons:0.30.0") + implementation("com.knecon.fforesight:tenant-commons:0.30.0") { + exclude(group = "com.iqser.red.commons", module = "storage-commons") + } implementation("com.azure:azure-ai-openai:1.0.0-beta.10") implementation("ch.qos.logback:logback-classic:1.5.7") + implementation("com.google.protobuf:protobuf-java:4.27.1") } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java index 3787618..744836c 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java @@ -2,10 +2,11 @@ package com.knecon.fforesight.llm.service.document; import java.io.Serializable; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -21,9 +22,15 @@ import lombok.experimental.FieldDefaults; 
@FieldDefaults(level = AccessLevel.PRIVATE) public class DocumentData implements Serializable { - DocumentPage[] documentPages; - DocumentTextData[] documentTextData; - DocumentPositionData[] documentPositionData; - DocumentStructure documentStructure; + DocumentPageProto.AllDocumentPages documentPages; + DocumentTextDataProto.AllDocumentTextData documentTextData; + DocumentPositionDataProto.AllDocumentPositionData documentPositionData; + DocumentStructureWrapper documentStructureWrapper; + + + public DocumentStructureProto.DocumentStructure getDocumentStructure() { + + return documentStructureWrapper.getDocumentStructure(); + } } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java index 87627bf..356ba1a 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java @@ -1,9 +1,11 @@ package com.knecon.fforesight.llm.service.document; +import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage; +import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData; +import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData; +import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; @@ -25,10 +27,6 @@ import com.knecon.fforesight.llm.service.document.nodes.TableCell; import 
com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import lombok.experimental.UtilityClass; @@ -41,28 +39,30 @@ public class DocumentGraphMapper { DocumentTree documentTree = new DocumentTree(document); Context context = new Context(documentData, documentTree); - context.pageData.addAll(Arrays.stream(documentData.getDocumentPages()) + context.pageData.addAll(documentData.getDocumentPages().getDocumentPagesList() + .stream() .map(DocumentGraphMapper::buildPage) .toList()); - context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context)); + context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context)); document.setDocumentTree(context.documentTree); document.setPages(new HashSet<>(context.pageData)); - document.setNumberOfPages(documentData.getDocumentPages().length); + document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount()); document.setTextBlock(document.getTextBlock()); return document; } - private List buildEntries(List entries, Context context) { + private List buildEntries(List entries, Context context) { List newEntries = new ArrayList<>(entries.size()); - for (DocumentStructure.EntryData entryData : entries) { + for (EntryData entryData : entries) { - List pages = Arrays.stream(entryData.getPageNumbers()) - .map(pageNumber -> 
getPage(pageNumber, context)) + List pages = entryData.getPageNumbersList() + .stream() + .map(context::getPage) .toList(); SemanticNode node = switch (entryData.getType()) { @@ -74,33 +74,30 @@ public class DocumentGraphMapper { case FOOTER -> buildFooter(context); case TABLE -> buildTable(context, entryData.getProperties()); case TABLE_CELL -> buildTableCell(context, entryData.getProperties()); - case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers()); + case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList()); default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType()); }; - if (entryData.getAtomicBlockIds().length > 0) { - TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node); + if (entryData.getAtomicBlockIdsCount() > 0) { + TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node); node.setLeafTextBlock(textBlock); + + switch (entryData.getType()) { + case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); + case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); + case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node)); + default -> textBlock.getAtomicTextBlocks() + .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb)); + } + } - List treeId = Arrays.stream(entryData.getTreeId()).boxed() - .toList(); - if (entryData.getEngines() != null) { - entryData.getEngines() - .forEach(node::addEngine); - } else { - entryData.setEngines(Collections.emptySet()); - } + List treeId = entryData.getTreeIdList(); + entryData.getEnginesList() + .forEach(node::addEngine); node.setTreeId(treeId); - switch (entryData.getType()) { - case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); - case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); - default -> pages.forEach(page -> page.getMainBody().add(node)); - } - - 
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build()); - } - return newEntries; + newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build()); + } return newEntries; } @@ -110,10 +107,10 @@ public class DocumentGraphMapper { } - private Image buildImage(Context context, Map properties, Long[] pageNumbers) { + private Image buildImage(Context context, Map properties, List pageNumbers) { - assert pageNumbers.length == 1; - Page page = getPage(pageNumbers[0], context); + assert pageNumbers.size() == 1; + Page page = context.getPage(pageNumbers.get(0)); var builder = Image.builder(); PropertiesMapper.parseImageProperties(properties, builder); return builder.documentTree(context.documentTree).page(page).build(); @@ -159,13 +156,14 @@ public class DocumentGraphMapper { return SuperSection.builder().documentTree(context.documentTree).build(); } + private Paragraph buildParagraph(Context context, Map properties) { if (PropertiesMapper.isDuplicateParagraph(properties)) { DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build(); - Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties); + var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties); duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph)); return duplicatedParagraph; } @@ -174,9 +172,9 @@ public class DocumentGraphMapper { } - private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { + private TextBlock toTextBlock(List atomicTextBlockIds, Context context, SemanticNode parent) { - return Arrays.stream(atomicTextBlockIds) + return atomicTextBlockIds.stream() .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)) .collect(new 
TextBlockCollector()); } @@ -184,24 +182,16 @@ public class DocumentGraphMapper { private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { - return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)), - context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)), + return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)), + context.documentPositionData.getDocumentPositionData(Math.toIntExact(atomicTextBlockId)), parent, - getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); + context.getPage(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)).getPage())); } private Page buildPage(DocumentPage p) { - return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build(); - } - - - private Page getPage(Long pageIndex, Context context) { - - Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1); - assert page.getNumber() == Math.toIntExact(pageIndex); - return page; + return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build(); } @@ -209,21 +199,27 @@ public class DocumentGraphMapper { private final DocumentTree documentTree; private final List pageData; - private final List documentTextData; - private final List documentPositionData; + private final AllDocumentTextData documentTextData; + private final AllDocumentPositionData documentPositionData; Context(DocumentData documentData, DocumentTree documentTree) { this.documentTree = documentTree; this.pageData = new ArrayList<>(); - this.documentTextData = Arrays.stream(documentData.getDocumentTextData()) - .toList(); - this.documentPositionData = 
Arrays.stream(documentData.getDocumentPositionData()) - .toList(); + this.documentTextData = documentData.getDocumentTextData(); + this.documentPositionData = documentData.getDocumentPositionData(); } + + private Page getPage(Long pageIndex) { + + Page page = pageData.get(Math.toIntExact(pageIndex) - 1); + assert page.getNumber() == Math.toIntExact(pageIndex); + return page; + } + } } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java index 9d7acfe..60f9d78 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java @@ -9,7 +9,7 @@ import com.knecon.fforesight.llm.service.document.nodes.Image; import com.knecon.fforesight.llm.service.document.nodes.ImageType; import com.knecon.fforesight.llm.service.document.nodes.Table; import com.knecon.fforesight.llm.service.document.nodes.TableCell; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper; import lombok.experimental.UtilityClass; @@ -18,32 +18,32 @@ public class PropertiesMapper { public void parseImageProperties(Map properties, Image.ImageBuilder builder) { - builder.imageType(ImageType.fromString(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE))); - builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT))); - builder.position(parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION))); - builder.id(properties.get(DocumentStructure.ImageProperties.ID)); + 
builder.imageType(ImageType.fromString(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE))); + builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT))); + builder.position(parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION))); + builder.id(properties.get(DocumentStructureWrapper.ImageProperties.ID)); } public void parseTableCellProperties(Map properties, TableCell.TableCellBuilder builder) { - builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW))); - builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL))); - builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER))); - builder.bBox(parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX))); + builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW))); + builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL))); + builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER))); + builder.bBox(parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX))); } public void parseTableProperties(Map properties, Table.TableBuilder builder) { - builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS))); - builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS))); + builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS))); + builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS))); } private Rectangle2D parseRectangle2D(String bBox) { - List floats = Arrays.stream(bBox.split(DocumentStructure.RECTANGLE_DELIMITER)) + List floats = 
Arrays.stream(bBox.split(DocumentStructureWrapper.RECTANGLE_DELIMITER)) .map(Float::parseFloat) .toList(); return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); @@ -52,21 +52,21 @@ public class PropertiesMapper { public static boolean isDuplicateParagraph(Map properties) { - return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID); + return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID); } - public static Long[] getUnsortedTextblockIds(Map properties) { + public static List getUnsortedTextblockIds(Map properties) { - return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID)); + return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID)); } - public static Long[] toLongArray(String ids) { + public static List toLongList(String ids) { return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")) .map(Long::valueOf) - .toArray(Long[]::new); + .toList(); } } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java index 8a0e26b..df2737e 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java @@ -9,7 +9,7 @@ import java.util.Set; import com.knecon.fforesight.llm.service.document.DocumentTree; import com.knecon.fforesight.llm.service.document.entity.TextEntity; import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; 
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -31,7 +31,7 @@ import lombok.extern.slf4j.Slf4j; public abstract class AbstractSemanticNode implements GenericSemanticNode { @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); + Set engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM)); @EqualsAndHashCode.Include List treeId; diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java index 0624629..5911148 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java @@ -5,6 +5,7 @@ import java.util.List; import java.util.Set; import com.knecon.fforesight.llm.service.document.entity.TextEntity; +import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; @@ -35,7 +36,8 @@ public class Page { Integer width; Integer rotation; - List mainBody; + + List textBlocksOnPage; Header header; Footer footer; @@ -53,13 +55,36 @@ public class Page { */ public TextBlock getMainBodyTextBlock() { - return mainBody.stream() - .filter(SemanticNode::isLeaf) - .map(SemanticNode::getTextBlock) + return textBlocksOnPage.stream() + .filter(atb -> !atb.isEmpty()) .collect(new TextBlockCollector()); } + /** + * Retrieves the highest SemanticNodes, which appear only on this page. It is achieved by traversing the DocumentTree up, until a SemanticNode's direct parent is no longer exclusively on this page. 
+ * @return A list which contains the highest SemanticNodes, which appear only on this page. + */ + public List getMainBody() { + + return textBlocksOnPage.stream() + .map(AtomicTextBlock::getParent) + .map(this::getHighestParentOnlyOnPage) + .distinct() + .toList(); + } + + private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) { + + SemanticNode currentNode = node; + while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) { + currentNode = currentNode.getParent(); + } + return currentNode; + } + + @Override public String toString() { diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java index b051fac..c525738 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java @@ -21,7 +21,7 @@ import com.knecon.fforesight.llm.service.document.entity.TextEntity; import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto; public interface SemanticNode { @@ -287,11 +287,10 @@ public interface SemanticNode { return getTextBlock().getSearchText().contains(string); } - - Set getEngines(); + Set getEngines(); - default void addEngine(LayoutEngine engine) { + default void addEngine(LayoutEngineProto.LayoutEngine engine) { getEngines().add(engine); } @@ -669,4 +668,17 @@ public interface SemanticNode { return 
bBoxPerPage; } + + /** + * Checks whether this SemanticNode appears on a single page only, and if that page is the provided one. + * + * @param page the page to check + * @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false. + */ + default boolean onlyOnPage(Page page) { + + Set pages = getPages(); + return pages.size() == 1 && pages.contains(page); + } + } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java index 89df017..80831f4 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java @@ -16,7 +16,7 @@ import com.knecon.fforesight.llm.service.document.DocumentTree; import com.knecon.fforesight.llm.service.document.entity.TextEntity; import com.knecon.fforesight.llm.service.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -36,7 +36,7 @@ import lombok.experimental.FieldDefaults; public class Table implements SemanticNode { @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); + Set engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM)); @EqualsAndHashCode.Include List treeId; DocumentTree documentTree; diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java 
b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java index 3b9a1bb..8151869 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java @@ -5,7 +5,6 @@ import static java.lang.String.format; import java.awt.geom.Rectangle2D; import java.text.BreakIterator; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -19,8 +18,8 @@ import com.knecon.fforesight.llm.service.document.RectangleTransformations; import com.knecon.fforesight.llm.service.document.TextRange; import com.knecon.fforesight.llm.service.document.nodes.Page; import com.knecon.fforesight.llm.service.document.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -78,7 +77,10 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData atomicTextBlockData, DocumentPositionData atomicPositionBlockData, SemanticNode parent, Page page) { + public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextDataProto.DocumentTextData atomicTextBlockData, + DocumentPositionDataProto.DocumentPositionData atomicPositionBlockData, + SemanticNode parent, + Page page) { return AtomicTextBlock.builder() .id(atomicTextBlockData.getId()) @@ -86,20 +88,18 @@ public class 
AtomicTextBlock implements TextBlock { .page(page) .textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd())) .searchText(atomicTextBlockData.getSearchText()) - .lineBreaks(Arrays.stream(atomicTextBlockData.getLineBreaks()).boxed() - .toList()) - .stringIdxToPositionIdx(Arrays.stream(atomicPositionBlockData.getStringIdxToPositionIdx()).boxed() - .toList()) - .positions(toRectangle2DList(atomicPositionBlockData.getPositions())) + .lineBreaks(atomicTextBlockData.getLineBreaksList()) + .stringIdxToPositionIdx(atomicPositionBlockData.getStringIdxToPositionIdxList()) + .positions(toRectangle2DList(atomicPositionBlockData.getPositionsList())) .parent(parent) .build(); } - private static List toRectangle2DList(float[][] positions) { + private static List toRectangle2DList(List positions) { - return Arrays.stream(positions) - .map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])) + return positions.stream() + .map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3))) .toList(); } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java index a8defcf..92b518f 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java @@ -1,7 +1,14 @@ package com.knecon.fforesight.llm.service.services; +import java.io.BufferedInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; import java.util.ArrayList; -import java.util.Collections; +import java.util.Arrays; import 
java.util.LinkedList; import java.util.List; import java.util.Map; @@ -24,11 +31,14 @@ import com.azure.ai.openai.models.CompletionsUsage; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.primitives.Floats; +import com.iqser.red.storage.commons.exception.StorageException; import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.llm.service.ChunkingResponse; import com.knecon.fforesight.llm.service.LlmNerEntities; import com.knecon.fforesight.llm.service.LlmNerEntity; import com.knecon.fforesight.llm.service.LlmNerMessage; +import com.knecon.fforesight.llm.service.LlmServiceSettings; import com.knecon.fforesight.llm.service.SystemMessages; import com.knecon.fforesight.llm.service.document.DocumentData; import com.knecon.fforesight.llm.service.document.DocumentGraphMapper; @@ -36,10 +46,18 @@ import com.knecon.fforesight.llm.service.document.nodes.Document; import com.knecon.fforesight.llm.service.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.models.Chunk; import com.knecon.fforesight.llm.service.utils.FormattingUtils; +import com.knecon.fforesight.llm.service.utils.StorageIdUtils; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper; 
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto; import com.knecon.fforesight.tenantcommons.TenantContext; import lombok.AccessLevel; @@ -217,16 +235,227 @@ public class LlmNerService { private Document buildDocument(LlmNerMessage llmNerMessage) { DocumentData documentData = new DocumentData(); - documentData.setDocumentStructure(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentStructureStorageId(), DocumentStructure.class)); - documentData.setDocumentTextData(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentTextStorageId(), DocumentTextData[].class)); - documentData.setDocumentPositionData(storageService.readJSONObject(TenantContext.getTenantId(), - llmNerMessage.getDocumentPositionStorageId(), - DocumentPositionData[].class)); - documentData.setDocumentPages(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentPagesStorageId(), DocumentPage[].class)); + documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(fetchDocumentStructure(llmNerMessage.getDocumentStructureStorageId()))); + documentData.setDocumentTextData(fetchDocumentTextData(llmNerMessage.getDocumentTextStorageId())); + documentData.setDocumentPositionData(fetchDocumentPositionData(llmNerMessage.getDocumentPositionStorageId())); + documentData.setDocumentPages(fetchAllDocumentPages(llmNerMessage.getDocumentPagesStorageId())); return DocumentGraphMapper.toDocumentGraph(documentData); } + private DocumentStructureProto.DocumentStructure fetchDocumentStructure(String storageId) { + + DocumentStructureProto.DocumentStructure documentStructure; + 
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); + + if (storageInfo.fileTypeExtension().contains("proto")) { + documentStructure = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentStructureProto.DocumentStructure.parser()); + } else { + DocumentStructure oldDocumentStructure = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentStructure.class); + if (oldDocumentStructure == null) { + return null; + } + documentStructure = convertDocumentStructure(oldDocumentStructure); + } + + return documentStructure; + } + + + private DocumentTextDataProto.AllDocumentTextData fetchDocumentTextData(String storageId) { + + DocumentTextDataProto.AllDocumentTextData documentTextData; + StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); + + if (storageInfo.fileTypeExtension().contains("proto")) { + documentTextData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentTextDataProto.AllDocumentTextData.parser()); + } else { + DocumentTextData[] oldDocumentTextData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentTextData[].class); + if (oldDocumentTextData == null) { + return null; + } + documentTextData = convertAllDocumentTextData(oldDocumentTextData); + } + + return documentTextData; + } + + + private DocumentPositionDataProto.AllDocumentPositionData fetchDocumentPositionData(String storageId) { + + DocumentPositionDataProto.AllDocumentPositionData documentPositionData; + StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); + + if (storageInfo.fileTypeExtension().contains("proto")) { + documentPositionData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPositionDataProto.AllDocumentPositionData.parser()); + } else { + DocumentPositionData[] oldDocumentPositionData = getOldData(storageInfo.dossierId(), 
storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPositionData[].class); + if (oldDocumentPositionData == null) { + return null; + } + documentPositionData = convertAllDocumentPositionData(oldDocumentPositionData); + } + + return documentPositionData; + } + + + private DocumentPageProto.AllDocumentPages fetchAllDocumentPages(String storageId) { + + DocumentPageProto.AllDocumentPages allDocumentPages; + StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); + + if (storageInfo.fileTypeExtension().contains("proto")) { + allDocumentPages = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPageProto.AllDocumentPages.parser()); + } else { + DocumentPage[] oldDocumentPages = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPage[].class); + if (oldDocumentPages == null) { + return null; + } + allDocumentPages = convertAllDocumentPages(oldDocumentPages); + } + + return allDocumentPages; + } + + + private T getOldData(String dossierId, String fileId, String fileType, Class valueType) { + + String oldStorageId = StorageIdUtils.getStorageId(dossierId, fileId, fileType, ".json"); + System.out.println("----------------> LOOKING FOR " + oldStorageId); + try (InputStream inputStream = getObject(TenantContext.getTenantId(), oldStorageId)) { + return mapper.readValue(inputStream, valueType); + } catch (IOException e) { + log.error("Could not read JSON for " + fileType + ", error was: " + e); + return null; + } + } + + + private static EntryDataProto.EntryData convertEntryData(DocumentStructure.EntryData oldEntryData) { + + EntryDataProto.EntryData.Builder builder = EntryDataProto.EntryData.newBuilder(); + + builder.setType(NodeTypeProto.NodeType.valueOf(oldEntryData.getType().name())); + builder.addAllTreeId(Arrays.stream(oldEntryData.getTreeId()).boxed() + .collect(Collectors.toList())); + builder.addAllAtomicBlockIds(Arrays.asList(oldEntryData.getAtomicBlockIds())); + 
builder.addAllPageNumbers(Arrays.asList(oldEntryData.getPageNumbers())); + + builder.putAllProperties(oldEntryData.getProperties()); + + if (oldEntryData.getChildren() != null) { + oldEntryData.getChildren() + .forEach(child -> builder.addChildren(convertEntryData(child))); + } + + return builder.build(); + } + + + private static DocumentStructureProto.DocumentStructure convertDocumentStructure(DocumentStructure oldStructure) { + + DocumentStructureProto.DocumentStructure.Builder newBuilder = DocumentStructureProto.DocumentStructure.newBuilder(); + + if (oldStructure.getRoot() != null) { + newBuilder.setRoot(convertEntryData(oldStructure.getRoot())); + } + + return newBuilder.build(); + } + + + private static DocumentPageProto.DocumentPage convertDocumentPage(DocumentPage oldPage) { + + return DocumentPageProto.DocumentPage.newBuilder() + .setNumber(oldPage.getNumber()) + .setHeight(oldPage.getHeight()) + .setWidth(oldPage.getWidth()) + .setRotation(oldPage.getRotation()) + .build(); + } + + + private static DocumentPageProto.AllDocumentPages convertAllDocumentPages(DocumentPage[] oldPages) { + + DocumentPageProto.AllDocumentPages.Builder allPagesBuilder = DocumentPageProto.AllDocumentPages.newBuilder(); + + for (DocumentPage oldPage : oldPages) { + DocumentPageProto.DocumentPage newPage = convertDocumentPage(oldPage); + allPagesBuilder.addDocumentPages(newPage); + } + + return allPagesBuilder.build(); + } + + + private static DocumentPositionDataProto.DocumentPositionData convertDocumentPositionData(DocumentPositionData oldData) { + + DocumentPositionDataProto.DocumentPositionData.Builder builder = DocumentPositionDataProto.DocumentPositionData.newBuilder() + .setId(oldData.getId()) + .addAllStringIdxToPositionIdx(Arrays.stream(oldData.getStringIdxToPositionIdx()).boxed() + .collect(Collectors.toList())); + + for (float[] pos : oldData.getPositions()) { + DocumentPositionDataProto.DocumentPositionData.Position position = 
DocumentPositionDataProto.DocumentPositionData.Position.newBuilder() + .addAllValue(Floats.asList(pos)) + .build(); + builder.addPositions(position); + } + + return builder.build(); + } + + + private static DocumentPositionDataProto.AllDocumentPositionData convertAllDocumentPositionData(DocumentPositionData[] oldDataList) { + + DocumentPositionDataProto.AllDocumentPositionData.Builder allDataBuilder = DocumentPositionDataProto.AllDocumentPositionData.newBuilder(); + + for (DocumentPositionData oldData : oldDataList) { + allDataBuilder.addDocumentPositionData(convertDocumentPositionData(oldData)); + } + + return allDataBuilder.build(); + } + + + private static DocumentTextDataProto.DocumentTextData convertDocumentTextData(DocumentTextData oldData) { + + DocumentTextDataProto.DocumentTextData.Builder builder = DocumentTextDataProto.DocumentTextData.newBuilder() + .setId(oldData.getId()) + .setPage(oldData.getPage()) + .setSearchText(oldData.getSearchText()) + .setNumberOnPage(oldData.getNumberOnPage()) + .setStart(oldData.getStart()) + .setEnd(oldData.getEnd()) + .addAllLineBreaks(Arrays.stream(oldData.getLineBreaks()).boxed() + .collect(Collectors.toList())); + + return builder.build(); + } + + + private static DocumentTextDataProto.AllDocumentTextData convertAllDocumentTextData(DocumentTextData[] oldDataList) { + + DocumentTextDataProto.AllDocumentTextData.Builder allDataBuilder = DocumentTextDataProto.AllDocumentTextData.newBuilder(); + + for (DocumentTextData oldData : oldDataList) { + allDataBuilder.addDocumentTextData(convertDocumentTextData(oldData)); + } + + return allDataBuilder.build(); + } + + + @SneakyThrows + private InputStream getObject(String tenantId, String storageId) { + + File tempFile = File.createTempFile("temp", ".data"); + storageService.downloadTo(tenantId, storageId, tempFile); + return new BufferedInputStream(Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE)); + } + + private record 
/**
 * Helpers for composing and parsing storage ids of the form
 * {@code <dossierId>/<fileId>.<fileTypeName>.<fileTypeExtension>}.
 *
 * <p>Written as a plain final class with static members so that all methods are
 * consistently static and usable via static imports.
 */
public final class StorageIdUtils {

    public static final String INVALID_STORAGE_ID_FORMAT = "Invalid storageId format";


    private StorageIdUtils() {

        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Builds a storage id by plain concatenation.
     *
     * @param fileExtension appended verbatim, so it must already contain its leading dot (e.g. {@code ".json"})
     * @return {@code dossierId + "/" + fileId + "." + fileName + fileExtension}
     */
    public static String getStorageId(String dossierId, String fileId, String fileName, String fileExtension) {

        return dossierId + "/" + fileId + "." + fileName + fileExtension;
    }


    /**
     * Splits a storage id into its components.
     *
     * <p>Everything before the first {@code /} is the dossier id. In the remainder, the text before the
     * first dot is the file id, the text after the last dot is the extension (without the dot), and
     * whatever lies between is the file type name (which may itself contain dots).
     *
     * @param storageId id to parse
     * @return the parsed components
     * @throws IllegalArgumentException with message {@link #INVALID_STORAGE_ID_FORMAT} if the id is
     *         {@code null}, has no {@code /}, has fewer than three dot-separated parts after the
     *         {@code /}, or has an empty dossier id, file id, file type name or extension
     */
    public static StorageInfo parseStorageId(String storageId) {

        if (storageId == null) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        String[] parts = storageId.split("/", 2);
        if (parts.length < 2 || parts[0].isEmpty()) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }
        String dossierId = parts[0];

        // A negative limit keeps trailing empty tokens, so "a.b.json." is seen as having an
        // empty extension instead of being silently treated as "a.b.json".
        String[] fileParts = parts[1].split("\\.", -1);
        if (fileParts.length < 3) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        String fileId = fileParts[0];
        String fileTypeExtension = fileParts[fileParts.length - 1];
        String fileTypeName = String.join(".", Arrays.copyOfRange(fileParts, 1, fileParts.length - 1));

        if (fileId.isEmpty() || fileTypeName.isEmpty() || fileTypeExtension.isEmpty()) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        return new StorageInfo(dossierId, fileId, fileTypeName, fileTypeExtension);
    }


    /** Components of a parsed storage id; {@code fileTypeExtension} carries no leading dot. */
    public record StorageInfo(String dossierId, String fileId, String fileTypeName, String fileTypeExtension) {

    }

}
b/llm-service/llm-service-server/build.gradle.kts @@ -32,8 +32,10 @@ dependencies { implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.3") implementation("org.springframework.boot:spring-boot-starter-websocket:$springBootVersion") implementation("org.springframework.security:spring-security-messaging:$springSecurityVersion") - implementation("com.iqser.red.commons:storage-commons:2.49.0") - implementation("com.knecon.fforesight:keycloak-commons:0.30.0") + implementation("com.iqser.red.commons:storage-commons:2.50.0") + implementation("com.knecon.fforesight:keycloak-commons:0.30.0") { + exclude(group = "com.knecon.fforesight", module = "tenant-commons") + } implementation("com.knecon.fforesight:tenant-commons:0.30.0") implementation("com.knecon.fforesight:swagger-commons:0.7.0") implementation("ch.qos.logback:logback-classic") diff --git a/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/StorageIdUtilsTest.java b/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/StorageIdUtilsTest.java new file mode 100644 index 0000000..50d8dbf --- /dev/null +++ b/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/StorageIdUtilsTest.java @@ -0,0 +1,45 @@ +package com.knecon.fforesight.llm.service; + + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.llm.service.utils.StorageIdUtils; + +public class StorageIdUtilsTest { + + @Test + void testParseStorageId_ValidInput() { + StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId("dossierId/fileId.DOCUMENT_STRUCTURE.json"); + assertEquals("dossierId", storageInfo.dossierId(), "Incorrect dossierId"); + assertEquals("fileId", storageInfo.fileId(), "Incorrect fileId"); + assertEquals("DOCUMENT_STRUCTURE", storageInfo.fileTypeName(), "Incorrect fileTypeName"); + 
assertEquals("json", storageInfo.fileTypeExtension(), "Incorrect fileTypeExtension"); + } + + @Test + void testParseStorageId_MissingFileTypeExtension() { + Exception exception = assertThrows(IllegalArgumentException.class, () -> + StorageIdUtils.parseStorageId("dossierId/fileId.DOCUMENT_STRUCTURE") + ); + assertEquals("Invalid storageId format", exception.getMessage()); + } + + @Test + void testParseStorageId_InvalidFormat() { + Exception exception = assertThrows(IllegalArgumentException.class, () -> + StorageIdUtils.parseStorageId("invalidFormat") + ); + assertEquals("Invalid storageId format", exception.getMessage()); + } + + @Test + void testParseStorageId_NoDotsInFilePart() { + Exception exception = assertThrows(IllegalArgumentException.class, () -> + StorageIdUtils.parseStorageId("dossierId/fileId") + ); + assertEquals("Invalid storageId format", exception.getMessage()); + } +}