Merge branch 'RED-9123' into 'main'
RED-9123: Protobuf serialization of document data files. See merge request fforesight/llm-service!24
This commit is contained in:
commit
2d66b1e5d4
@ -13,10 +13,13 @@ extra["testcontainersVersion"] = "1.20.0"
|
|||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation(project(":llm-service-api"))
|
implementation(project(":llm-service-api"))
|
||||||
implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.159.0")
|
implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.181.0")
|
||||||
implementation("com.iqser.red.commons:storage-commons:2.49.0")
|
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||||
implementation("org.springframework.boot:spring-boot-starter:3.1.1")
|
implementation("org.springframework.boot:spring-boot-starter:3.1.1")
|
||||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
|
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
|
||||||
|
exclude(group = "com.iqser.red.commons", module = "storage-commons")
|
||||||
|
}
|
||||||
implementation("com.azure:azure-ai-openai:1.0.0-beta.10")
|
implementation("com.azure:azure-ai-openai:1.0.0-beta.10")
|
||||||
implementation("ch.qos.logback:logback-classic:1.5.7")
|
implementation("ch.qos.logback:logback-classic:1.5.7")
|
||||||
|
implementation("com.google.protobuf:protobuf-java:4.27.1")
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,10 +2,11 @@ package com.knecon.fforesight.llm.service.document;
|
|||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -21,9 +22,15 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
public class DocumentData implements Serializable {
|
public class DocumentData implements Serializable {
|
||||||
|
|
||||||
DocumentPage[] documentPages;
|
DocumentPageProto.AllDocumentPages documentPages;
|
||||||
DocumentTextData[] documentTextData;
|
DocumentTextDataProto.AllDocumentTextData documentTextData;
|
||||||
DocumentPositionData[] documentPositionData;
|
DocumentPositionDataProto.AllDocumentPositionData documentPositionData;
|
||||||
DocumentStructure documentStructure;
|
DocumentStructureWrapper documentStructureWrapper;
|
||||||
|
|
||||||
|
|
||||||
|
public DocumentStructureProto.DocumentStructure getDocumentStructure() {
|
||||||
|
|
||||||
|
return documentStructureWrapper.getDocumentStructure();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,9 +1,11 @@
|
|||||||
package com.knecon.fforesight.llm.service.document;
|
package com.knecon.fforesight.llm.service.document;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -25,10 +27,6 @@ import com.knecon.fforesight.llm.service.document.nodes.TableCell;
|
|||||||
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@ -41,28 +39,30 @@ public class DocumentGraphMapper {
|
|||||||
DocumentTree documentTree = new DocumentTree(document);
|
DocumentTree documentTree = new DocumentTree(document);
|
||||||
Context context = new Context(documentData, documentTree);
|
Context context = new Context(documentData, documentTree);
|
||||||
|
|
||||||
context.pageData.addAll(Arrays.stream(documentData.getDocumentPages())
|
context.pageData.addAll(documentData.getDocumentPages().getDocumentPagesList()
|
||||||
|
.stream()
|
||||||
.map(DocumentGraphMapper::buildPage)
|
.map(DocumentGraphMapper::buildPage)
|
||||||
.toList());
|
.toList());
|
||||||
|
|
||||||
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
|
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context));
|
||||||
|
|
||||||
document.setDocumentTree(context.documentTree);
|
document.setDocumentTree(context.documentTree);
|
||||||
document.setPages(new HashSet<>(context.pageData));
|
document.setPages(new HashSet<>(context.pageData));
|
||||||
document.setNumberOfPages(documentData.getDocumentPages().length);
|
document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount());
|
||||||
|
|
||||||
document.setTextBlock(document.getTextBlock());
|
document.setTextBlock(document.getTextBlock());
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
|
private List<DocumentTree.Entry> buildEntries(List<EntryData> entries, Context context) {
|
||||||
|
|
||||||
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
|
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
|
||||||
for (DocumentStructure.EntryData entryData : entries) {
|
for (EntryData entryData : entries) {
|
||||||
|
|
||||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
List<Page> pages = entryData.getPageNumbersList()
|
||||||
.map(pageNumber -> getPage(pageNumber, context))
|
.stream()
|
||||||
|
.map(context::getPage)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
SemanticNode node = switch (entryData.getType()) {
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
@ -74,33 +74,30 @@ public class DocumentGraphMapper {
|
|||||||
case FOOTER -> buildFooter(context);
|
case FOOTER -> buildFooter(context);
|
||||||
case TABLE -> buildTable(context, entryData.getProperties());
|
case TABLE -> buildTable(context, entryData.getProperties());
|
||||||
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
|
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
|
||||||
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
|
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList());
|
||||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||||
};
|
};
|
||||||
|
|
||||||
if (entryData.getAtomicBlockIds().length > 0) {
|
if (entryData.getAtomicBlockIdsCount() > 0) {
|
||||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node);
|
||||||
node.setLeafTextBlock(textBlock);
|
node.setLeafTextBlock(textBlock);
|
||||||
|
|
||||||
|
switch (entryData.getType()) {
|
||||||
|
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||||
|
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||||
|
case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node));
|
||||||
|
default -> textBlock.getAtomicTextBlocks()
|
||||||
|
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
|
List<Integer> treeId = entryData.getTreeIdList();
|
||||||
.toList();
|
entryData.getEnginesList()
|
||||||
if (entryData.getEngines() != null) {
|
.forEach(node::addEngine);
|
||||||
entryData.getEngines()
|
|
||||||
.forEach(node::addEngine);
|
|
||||||
} else {
|
|
||||||
entryData.setEngines(Collections.emptySet());
|
|
||||||
}
|
|
||||||
node.setTreeId(treeId);
|
node.setTreeId(treeId);
|
||||||
|
|
||||||
switch (entryData.getType()) {
|
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), context)).node(node).build());
|
||||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
} return newEntries;
|
||||||
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
|
||||||
default -> pages.forEach(page -> page.getMainBody().add(node));
|
|
||||||
}
|
|
||||||
|
|
||||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
|
||||||
}
|
|
||||||
return newEntries;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -110,10 +107,10 @@ public class DocumentGraphMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
|
private Image buildImage(Context context, Map<String, String> properties, List<Long> pageNumbers) {
|
||||||
|
|
||||||
assert pageNumbers.length == 1;
|
assert pageNumbers.size() == 1;
|
||||||
Page page = getPage(pageNumbers[0], context);
|
Page page = context.getPage(pageNumbers.get(0));
|
||||||
var builder = Image.builder();
|
var builder = Image.builder();
|
||||||
PropertiesMapper.parseImageProperties(properties, builder);
|
PropertiesMapper.parseImageProperties(properties, builder);
|
||||||
return builder.documentTree(context.documentTree).page(page).build();
|
return builder.documentTree(context.documentTree).page(page).build();
|
||||||
@ -159,13 +156,14 @@ public class DocumentGraphMapper {
|
|||||||
return SuperSection.builder().documentTree(context.documentTree).build();
|
return SuperSection.builder().documentTree(context.documentTree).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
||||||
|
|
||||||
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
||||||
|
|
||||||
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
|
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
|
||||||
|
|
||||||
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
||||||
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
|
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
|
||||||
return duplicatedParagraph;
|
return duplicatedParagraph;
|
||||||
}
|
}
|
||||||
@ -174,9 +172,9 @@ public class DocumentGraphMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
private TextBlock toTextBlock(List<Long> atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||||
|
|
||||||
return Arrays.stream(atomicTextBlockIds)
|
return atomicTextBlockIds.stream()
|
||||||
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
||||||
.collect(new TextBlockCollector());
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
@ -184,24 +182,16 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||||
|
|
||||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
|
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)),
|
||||||
context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
|
context.documentPositionData.getDocumentPositionData(Math.toIntExact(atomicTextBlockId)),
|
||||||
parent,
|
parent,
|
||||||
getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
context.getPage(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)).getPage()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Page buildPage(DocumentPage p) {
|
private Page buildPage(DocumentPage p) {
|
||||||
|
|
||||||
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Page getPage(Long pageIndex, Context context) {
|
|
||||||
|
|
||||||
Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
|
|
||||||
assert page.getNumber() == Math.toIntExact(pageIndex);
|
|
||||||
return page;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -209,21 +199,27 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
private final DocumentTree documentTree;
|
private final DocumentTree documentTree;
|
||||||
private final List<Page> pageData;
|
private final List<Page> pageData;
|
||||||
private final List<DocumentTextData> documentTextData;
|
private final AllDocumentTextData documentTextData;
|
||||||
private final List<DocumentPositionData> documentPositionData;
|
private final AllDocumentPositionData documentPositionData;
|
||||||
|
|
||||||
|
|
||||||
Context(DocumentData documentData, DocumentTree documentTree) {
|
Context(DocumentData documentData, DocumentTree documentTree) {
|
||||||
|
|
||||||
this.documentTree = documentTree;
|
this.documentTree = documentTree;
|
||||||
this.pageData = new ArrayList<>();
|
this.pageData = new ArrayList<>();
|
||||||
this.documentTextData = Arrays.stream(documentData.getDocumentTextData())
|
this.documentTextData = documentData.getDocumentTextData();
|
||||||
.toList();
|
this.documentPositionData = documentData.getDocumentPositionData();
|
||||||
this.documentPositionData = Arrays.stream(documentData.getDocumentPositionData())
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Page getPage(Long pageIndex) {
|
||||||
|
|
||||||
|
Page page = pageData.get(Math.toIntExact(pageIndex) - 1);
|
||||||
|
assert page.getNumber() == Math.toIntExact(pageIndex);
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import com.knecon.fforesight.llm.service.document.nodes.Image;
|
|||||||
import com.knecon.fforesight.llm.service.document.nodes.ImageType;
|
import com.knecon.fforesight.llm.service.document.nodes.ImageType;
|
||||||
import com.knecon.fforesight.llm.service.document.nodes.Table;
|
import com.knecon.fforesight.llm.service.document.nodes.Table;
|
||||||
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
|
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@ -18,32 +18,32 @@ public class PropertiesMapper {
|
|||||||
|
|
||||||
public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
|
public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
|
||||||
|
|
||||||
builder.imageType(ImageType.fromString(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE)));
|
builder.imageType(ImageType.fromString(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE)));
|
||||||
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT)));
|
builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT)));
|
||||||
builder.position(parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION)));
|
builder.position(parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION)));
|
||||||
builder.id(properties.get(DocumentStructure.ImageProperties.ID));
|
builder.id(properties.get(DocumentStructureWrapper.ImageProperties.ID));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
|
public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
|
||||||
|
|
||||||
builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW)));
|
builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW)));
|
||||||
builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL)));
|
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL)));
|
||||||
builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER)));
|
builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER)));
|
||||||
builder.bBox(parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX)));
|
builder.bBox(parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
||||||
|
|
||||||
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
|
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS)));
|
||||||
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
|
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Rectangle2D parseRectangle2D(String bBox) {
|
private Rectangle2D parseRectangle2D(String bBox) {
|
||||||
|
|
||||||
List<Float> floats = Arrays.stream(bBox.split(DocumentStructure.RECTANGLE_DELIMITER))
|
List<Float> floats = Arrays.stream(bBox.split(DocumentStructureWrapper.RECTANGLE_DELIMITER))
|
||||||
.map(Float::parseFloat)
|
.map(Float::parseFloat)
|
||||||
.toList();
|
.toList();
|
||||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||||
@ -52,21 +52,21 @@ public class PropertiesMapper {
|
|||||||
|
|
||||||
public static boolean isDuplicateParagraph(Map<String, String> properties) {
|
public static boolean isDuplicateParagraph(Map<String, String> properties) {
|
||||||
|
|
||||||
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
|
public static List<Long> getUnsortedTextblockIds(Map<String, String> properties) {
|
||||||
|
|
||||||
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Long[] toLongArray(String ids) {
|
public static List<Long> toLongList(String ids) {
|
||||||
|
|
||||||
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
|
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
|
||||||
.map(Long::valueOf)
|
.map(Long::valueOf)
|
||||||
.toArray(Long[]::new);
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import java.util.Set;
|
|||||||
import com.knecon.fforesight.llm.service.document.DocumentTree;
|
import com.knecon.fforesight.llm.service.document.DocumentTree;
|
||||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -31,7 +31,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public abstract class AbstractSemanticNode implements GenericSemanticNode {
|
public abstract class AbstractSemanticNode implements GenericSemanticNode {
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
Set<LayoutEngineProto.LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM));
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
List<Integer> treeId;
|
List<Integer> treeId;
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||||
|
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||||
|
|
||||||
@ -35,7 +36,8 @@ public class Page {
|
|||||||
Integer width;
|
Integer width;
|
||||||
Integer rotation;
|
Integer rotation;
|
||||||
|
|
||||||
List<SemanticNode> mainBody;
|
|
||||||
|
List<AtomicTextBlock> textBlocksOnPage;
|
||||||
Header header;
|
Header header;
|
||||||
Footer footer;
|
Footer footer;
|
||||||
|
|
||||||
@ -53,13 +55,36 @@ public class Page {
|
|||||||
*/
|
*/
|
||||||
public TextBlock getMainBodyTextBlock() {
|
public TextBlock getMainBodyTextBlock() {
|
||||||
|
|
||||||
return mainBody.stream()
|
return textBlocksOnPage.stream()
|
||||||
.filter(SemanticNode::isLeaf)
|
.filter(atb -> !atb.isEmpty())
|
||||||
.map(SemanticNode::getTextBlock)
|
|
||||||
.collect(new TextBlockCollector());
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the highest SemanticNodes that appear only on this page. This is achieved by traversing up the DocumentTree until a SemanticNode's direct parent is no longer exclusively on this page.
|
||||||
|
*
|
||||||
|
* @return A list which contains the highest SemanticNodes, which appear only on this page.
|
||||||
|
*/
|
||||||
|
public List<SemanticNode> getMainBody() {
|
||||||
|
|
||||||
|
return textBlocksOnPage.stream()
|
||||||
|
.map(AtomicTextBlock::getParent)
|
||||||
|
.map(this::getHighestParentOnlyOnPage)
|
||||||
|
.distinct()
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) {
|
||||||
|
|
||||||
|
SemanticNode currentNode = node;
|
||||||
|
while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
|
||||||
|
currentNode = currentNode.getParent();
|
||||||
|
}
|
||||||
|
return currentNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
|
|||||||
@ -21,7 +21,7 @@ import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
|||||||
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
|
||||||
|
|
||||||
public interface SemanticNode {
|
public interface SemanticNode {
|
||||||
|
|
||||||
@ -287,11 +287,10 @@ public interface SemanticNode {
|
|||||||
return getTextBlock().getSearchText().contains(string);
|
return getTextBlock().getSearchText().contains(string);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Set<LayoutEngineProto.LayoutEngine> getEngines();
|
||||||
Set<LayoutEngine> getEngines();
|
|
||||||
|
|
||||||
|
|
||||||
default void addEngine(LayoutEngine engine) {
|
default void addEngine(LayoutEngineProto.LayoutEngine engine) {
|
||||||
|
|
||||||
getEngines().add(engine);
|
getEngines().add(engine);
|
||||||
}
|
}
|
||||||
@ -669,4 +668,17 @@ public interface SemanticNode {
|
|||||||
return bBoxPerPage;
|
return bBoxPerPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether this SemanticNode appears on a single page only, and if that page is the provided one.
|
||||||
|
*
|
||||||
|
* @param page the page to check
|
||||||
|
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
|
||||||
|
*/
|
||||||
|
default boolean onlyOnPage(Page page) {
|
||||||
|
|
||||||
|
Set<Page> pages = getPages();
|
||||||
|
return pages.size() == 1 && pages.contains(page);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@ import com.knecon.fforesight.llm.service.document.DocumentTree;
|
|||||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -36,7 +36,7 @@ import lombok.experimental.FieldDefaults;
|
|||||||
public class Table implements SemanticNode {
|
public class Table implements SemanticNode {
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
Set<LayoutEngineProto.LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM));
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
List<Integer> treeId;
|
List<Integer> treeId;
|
||||||
DocumentTree documentTree;
|
DocumentTree documentTree;
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import static java.lang.String.format;
|
|||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.text.BreakIterator;
|
import java.text.BreakIterator;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -19,8 +18,8 @@ import com.knecon.fforesight.llm.service.document.RectangleTransformations;
|
|||||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||||
import com.knecon.fforesight.llm.service.document.nodes.Page;
|
import com.knecon.fforesight.llm.service.document.nodes.Page;
|
||||||
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
|
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -78,7 +77,10 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData atomicTextBlockData, DocumentPositionData atomicPositionBlockData, SemanticNode parent, Page page) {
|
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextDataProto.DocumentTextData atomicTextBlockData,
|
||||||
|
DocumentPositionDataProto.DocumentPositionData atomicPositionBlockData,
|
||||||
|
SemanticNode parent,
|
||||||
|
Page page) {
|
||||||
|
|
||||||
return AtomicTextBlock.builder()
|
return AtomicTextBlock.builder()
|
||||||
.id(atomicTextBlockData.getId())
|
.id(atomicTextBlockData.getId())
|
||||||
@ -86,20 +88,18 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
.page(page)
|
.page(page)
|
||||||
.textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
|
.textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
|
||||||
.searchText(atomicTextBlockData.getSearchText())
|
.searchText(atomicTextBlockData.getSearchText())
|
||||||
.lineBreaks(Arrays.stream(atomicTextBlockData.getLineBreaks()).boxed()
|
.lineBreaks(atomicTextBlockData.getLineBreaksList())
|
||||||
.toList())
|
.stringIdxToPositionIdx(atomicPositionBlockData.getStringIdxToPositionIdxList())
|
||||||
.stringIdxToPositionIdx(Arrays.stream(atomicPositionBlockData.getStringIdxToPositionIdx()).boxed()
|
.positions(toRectangle2DList(atomicPositionBlockData.getPositionsList()))
|
||||||
.toList())
|
|
||||||
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
|
|
||||||
.parent(parent)
|
.parent(parent)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
private static List<Rectangle2D> toRectangle2DList(List<DocumentPositionDataProto.DocumentPositionData.Position> positions) {
|
||||||
|
|
||||||
return Arrays.stream(positions)
|
return positions.stream()
|
||||||
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
|
.map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3)))
|
||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,14 @@
|
|||||||
package com.knecon.fforesight.llm.service.services;
|
package com.knecon.fforesight.llm.service.services;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Arrays;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -24,11 +31,14 @@ import com.azure.ai.openai.models.CompletionsUsage;
|
|||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.core.type.TypeReference;
|
import com.fasterxml.jackson.core.type.TypeReference;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.primitives.Floats;
|
||||||
|
import com.iqser.red.storage.commons.exception.StorageException;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
import com.knecon.fforesight.llm.service.ChunkingResponse;
|
import com.knecon.fforesight.llm.service.ChunkingResponse;
|
||||||
import com.knecon.fforesight.llm.service.LlmNerEntities;
|
import com.knecon.fforesight.llm.service.LlmNerEntities;
|
||||||
import com.knecon.fforesight.llm.service.LlmNerEntity;
|
import com.knecon.fforesight.llm.service.LlmNerEntity;
|
||||||
import com.knecon.fforesight.llm.service.LlmNerMessage;
|
import com.knecon.fforesight.llm.service.LlmNerMessage;
|
||||||
|
import com.knecon.fforesight.llm.service.LlmServiceSettings;
|
||||||
import com.knecon.fforesight.llm.service.SystemMessages;
|
import com.knecon.fforesight.llm.service.SystemMessages;
|
||||||
import com.knecon.fforesight.llm.service.document.DocumentData;
|
import com.knecon.fforesight.llm.service.document.DocumentData;
|
||||||
import com.knecon.fforesight.llm.service.document.DocumentGraphMapper;
|
import com.knecon.fforesight.llm.service.document.DocumentGraphMapper;
|
||||||
@ -36,10 +46,18 @@ import com.knecon.fforesight.llm.service.document.nodes.Document;
|
|||||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.llm.service.models.Chunk;
|
import com.knecon.fforesight.llm.service.models.Chunk;
|
||||||
import com.knecon.fforesight.llm.service.utils.FormattingUtils;
|
import com.knecon.fforesight.llm.service.utils.FormattingUtils;
|
||||||
|
import com.knecon.fforesight.llm.service.utils.StorageIdUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
@ -217,16 +235,227 @@ public class LlmNerService {
|
|||||||
private Document buildDocument(LlmNerMessage llmNerMessage) {
|
private Document buildDocument(LlmNerMessage llmNerMessage) {
|
||||||
|
|
||||||
DocumentData documentData = new DocumentData();
|
DocumentData documentData = new DocumentData();
|
||||||
documentData.setDocumentStructure(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentStructureStorageId(), DocumentStructure.class));
|
documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(fetchDocumentStructure(llmNerMessage.getDocumentStructureStorageId())));
|
||||||
documentData.setDocumentTextData(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentTextStorageId(), DocumentTextData[].class));
|
documentData.setDocumentTextData(fetchDocumentTextData(llmNerMessage.getDocumentTextStorageId()));
|
||||||
documentData.setDocumentPositionData(storageService.readJSONObject(TenantContext.getTenantId(),
|
documentData.setDocumentPositionData(fetchDocumentPositionData(llmNerMessage.getDocumentPositionStorageId()));
|
||||||
llmNerMessage.getDocumentPositionStorageId(),
|
documentData.setDocumentPages(fetchAllDocumentPages(llmNerMessage.getDocumentPagesStorageId()));
|
||||||
DocumentPositionData[].class));
|
|
||||||
documentData.setDocumentPages(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentPagesStorageId(), DocumentPage[].class));
|
|
||||||
return DocumentGraphMapper.toDocumentGraph(documentData);
|
return DocumentGraphMapper.toDocumentGraph(documentData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private DocumentStructureProto.DocumentStructure fetchDocumentStructure(String storageId) {
|
||||||
|
|
||||||
|
DocumentStructureProto.DocumentStructure documentStructure;
|
||||||
|
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
|
||||||
|
|
||||||
|
if (storageInfo.fileTypeExtension().contains("proto")) {
|
||||||
|
documentStructure = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentStructureProto.DocumentStructure.parser());
|
||||||
|
} else {
|
||||||
|
DocumentStructure oldDocumentStructure = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentStructure.class);
|
||||||
|
if (oldDocumentStructure == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
documentStructure = convertDocumentStructure(oldDocumentStructure);
|
||||||
|
}
|
||||||
|
|
||||||
|
return documentStructure;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private DocumentTextDataProto.AllDocumentTextData fetchDocumentTextData(String storageId) {
|
||||||
|
|
||||||
|
DocumentTextDataProto.AllDocumentTextData documentTextData;
|
||||||
|
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
|
||||||
|
|
||||||
|
if (storageInfo.fileTypeExtension().contains("proto")) {
|
||||||
|
documentTextData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentTextDataProto.AllDocumentTextData.parser());
|
||||||
|
} else {
|
||||||
|
DocumentTextData[] oldDocumentTextData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentTextData[].class);
|
||||||
|
if (oldDocumentTextData == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
documentTextData = convertAllDocumentTextData(oldDocumentTextData);
|
||||||
|
}
|
||||||
|
|
||||||
|
return documentTextData;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private DocumentPositionDataProto.AllDocumentPositionData fetchDocumentPositionData(String storageId) {
|
||||||
|
|
||||||
|
DocumentPositionDataProto.AllDocumentPositionData documentPositionData;
|
||||||
|
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
|
||||||
|
|
||||||
|
if (storageInfo.fileTypeExtension().contains("proto")) {
|
||||||
|
documentPositionData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPositionDataProto.AllDocumentPositionData.parser());
|
||||||
|
} else {
|
||||||
|
DocumentPositionData[] oldDocumentPositionData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPositionData[].class);
|
||||||
|
if (oldDocumentPositionData == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
documentPositionData = convertAllDocumentPositionData(oldDocumentPositionData);
|
||||||
|
}
|
||||||
|
|
||||||
|
return documentPositionData;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private DocumentPageProto.AllDocumentPages fetchAllDocumentPages(String storageId) {
|
||||||
|
|
||||||
|
DocumentPageProto.AllDocumentPages allDocumentPages;
|
||||||
|
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId);
|
||||||
|
|
||||||
|
if (storageInfo.fileTypeExtension().contains("proto")) {
|
||||||
|
allDocumentPages = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPageProto.AllDocumentPages.parser());
|
||||||
|
} else {
|
||||||
|
DocumentPage[] oldDocumentPages = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPage[].class);
|
||||||
|
if (oldDocumentPages == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
allDocumentPages = convertAllDocumentPages(oldDocumentPages);
|
||||||
|
}
|
||||||
|
|
||||||
|
return allDocumentPages;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private <T> T getOldData(String dossierId, String fileId, String fileType, Class<T> valueType) {
|
||||||
|
|
||||||
|
String oldStorageId = StorageIdUtils.getStorageId(dossierId, fileId, fileType, ".json");
|
||||||
|
System.out.println("----------------> LOOKING FOR " + oldStorageId);
|
||||||
|
try (InputStream inputStream = getObject(TenantContext.getTenantId(), oldStorageId)) {
|
||||||
|
return mapper.readValue(inputStream, valueType);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Could not read JSON for " + fileType + ", error was: " + e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static EntryDataProto.EntryData convertEntryData(DocumentStructure.EntryData oldEntryData) {
|
||||||
|
|
||||||
|
EntryDataProto.EntryData.Builder builder = EntryDataProto.EntryData.newBuilder();
|
||||||
|
|
||||||
|
builder.setType(NodeTypeProto.NodeType.valueOf(oldEntryData.getType().name()));
|
||||||
|
builder.addAllTreeId(Arrays.stream(oldEntryData.getTreeId()).boxed()
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
builder.addAllAtomicBlockIds(Arrays.asList(oldEntryData.getAtomicBlockIds()));
|
||||||
|
builder.addAllPageNumbers(Arrays.asList(oldEntryData.getPageNumbers()));
|
||||||
|
|
||||||
|
builder.putAllProperties(oldEntryData.getProperties());
|
||||||
|
|
||||||
|
if (oldEntryData.getChildren() != null) {
|
||||||
|
oldEntryData.getChildren()
|
||||||
|
.forEach(child -> builder.addChildren(convertEntryData(child)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentStructureProto.DocumentStructure convertDocumentStructure(DocumentStructure oldStructure) {
|
||||||
|
|
||||||
|
DocumentStructureProto.DocumentStructure.Builder newBuilder = DocumentStructureProto.DocumentStructure.newBuilder();
|
||||||
|
|
||||||
|
if (oldStructure.getRoot() != null) {
|
||||||
|
newBuilder.setRoot(convertEntryData(oldStructure.getRoot()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return newBuilder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentPageProto.DocumentPage convertDocumentPage(DocumentPage oldPage) {
|
||||||
|
|
||||||
|
return DocumentPageProto.DocumentPage.newBuilder()
|
||||||
|
.setNumber(oldPage.getNumber())
|
||||||
|
.setHeight(oldPage.getHeight())
|
||||||
|
.setWidth(oldPage.getWidth())
|
||||||
|
.setRotation(oldPage.getRotation())
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentPageProto.AllDocumentPages convertAllDocumentPages(DocumentPage[] oldPages) {
|
||||||
|
|
||||||
|
DocumentPageProto.AllDocumentPages.Builder allPagesBuilder = DocumentPageProto.AllDocumentPages.newBuilder();
|
||||||
|
|
||||||
|
for (DocumentPage oldPage : oldPages) {
|
||||||
|
DocumentPageProto.DocumentPage newPage = convertDocumentPage(oldPage);
|
||||||
|
allPagesBuilder.addDocumentPages(newPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
return allPagesBuilder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentPositionDataProto.DocumentPositionData convertDocumentPositionData(DocumentPositionData oldData) {
|
||||||
|
|
||||||
|
DocumentPositionDataProto.DocumentPositionData.Builder builder = DocumentPositionDataProto.DocumentPositionData.newBuilder()
|
||||||
|
.setId(oldData.getId())
|
||||||
|
.addAllStringIdxToPositionIdx(Arrays.stream(oldData.getStringIdxToPositionIdx()).boxed()
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
for (float[] pos : oldData.getPositions()) {
|
||||||
|
DocumentPositionDataProto.DocumentPositionData.Position position = DocumentPositionDataProto.DocumentPositionData.Position.newBuilder()
|
||||||
|
.addAllValue(Floats.asList(pos))
|
||||||
|
.build();
|
||||||
|
builder.addPositions(position);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentPositionDataProto.AllDocumentPositionData convertAllDocumentPositionData(DocumentPositionData[] oldDataList) {
|
||||||
|
|
||||||
|
DocumentPositionDataProto.AllDocumentPositionData.Builder allDataBuilder = DocumentPositionDataProto.AllDocumentPositionData.newBuilder();
|
||||||
|
|
||||||
|
for (DocumentPositionData oldData : oldDataList) {
|
||||||
|
allDataBuilder.addDocumentPositionData(convertDocumentPositionData(oldData));
|
||||||
|
}
|
||||||
|
|
||||||
|
return allDataBuilder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentTextDataProto.DocumentTextData convertDocumentTextData(DocumentTextData oldData) {
|
||||||
|
|
||||||
|
DocumentTextDataProto.DocumentTextData.Builder builder = DocumentTextDataProto.DocumentTextData.newBuilder()
|
||||||
|
.setId(oldData.getId())
|
||||||
|
.setPage(oldData.getPage())
|
||||||
|
.setSearchText(oldData.getSearchText())
|
||||||
|
.setNumberOnPage(oldData.getNumberOnPage())
|
||||||
|
.setStart(oldData.getStart())
|
||||||
|
.setEnd(oldData.getEnd())
|
||||||
|
.addAllLineBreaks(Arrays.stream(oldData.getLineBreaks()).boxed()
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static DocumentTextDataProto.AllDocumentTextData convertAllDocumentTextData(DocumentTextData[] oldDataList) {
|
||||||
|
|
||||||
|
DocumentTextDataProto.AllDocumentTextData.Builder allDataBuilder = DocumentTextDataProto.AllDocumentTextData.newBuilder();
|
||||||
|
|
||||||
|
for (DocumentTextData oldData : oldDataList) {
|
||||||
|
allDataBuilder.addDocumentTextData(convertDocumentTextData(oldData));
|
||||||
|
}
|
||||||
|
|
||||||
|
return allDataBuilder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private InputStream getObject(String tenantId, String storageId) {
|
||||||
|
|
||||||
|
File tempFile = File.createTempFile("temp", ".data");
|
||||||
|
storageService.downloadTo(tenantId, storageId, tempFile);
|
||||||
|
return new BufferedInputStream(Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private record EntitiesWithUsage(List<LlmNerEntity> entities, CompletionsUsage completionsUsage) {
|
private record EntitiesWithUsage(List<LlmNerEntity> entities, CompletionsUsage completionsUsage) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,48 @@
|
|||||||
|
package com.knecon.fforesight.llm.service.utils;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
/**
 * Helpers for building and parsing storage ids of the form
 * {@code <dossierId>/<fileId>.<fileTypeName>.<fileTypeExtension>}.
 * <p>
 * This is a static utility class and cannot be instantiated.
 */
public final class StorageIdUtils {

    public static final String INVALID_STORAGE_ID_FORMAT = "Invalid storageId format";


    private StorageIdUtils() {

        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Builds a storage id from its components.
     *
     * @param dossierId     dossier part, placed before the slash
     * @param fileId        file id, placed directly after the slash
     * @param fileName      file type name, separated from the file id by a dot
     * @param fileExtension extension appended verbatim, so it has to include its leading dot (e.g. ".json")
     * @return the concatenated storage id
     */
    public static String getStorageId(String dossierId, String fileId, String fileName, String fileExtension) {

        return dossierId + "/" + fileId + "." + fileName + fileExtension;
    }


    /**
     * Splits a storage id into dossier id, file id, file type name and file type extension.
     * <p>
     * Everything before the first slash is the dossier id. The remainder is split on dots: the first
     * segment is the file id, the last segment is the extension (without dot) and all segments in
     * between, re-joined with dots, form the file type name.
     *
     * @param storageId storage id to parse
     * @return the parsed components
     * @throws IllegalArgumentException if the id is {@code null}, has no slash, has fewer than three
     *                                  dot-separated segments after the slash, or has an empty
     *                                  dossier id, file id, type name or extension
     */
    public static StorageInfo parseStorageId(String storageId) {

        if (storageId == null) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        String[] parts = storageId.split("/", 2);
        if (parts.length < 2) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        String dossierId = parts[0];
        // Limit -1 keeps trailing empty segments, so an id ending in "." is rejected instead of silently truncated.
        String[] fileParts = parts[1].split("\\.", -1);
        if (fileParts.length < 3) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        String fileId = fileParts[0];
        String fileTypeExtension = fileParts[fileParts.length - 1];
        String fileTypeName = String.join(".", Arrays.copyOfRange(fileParts, 1, fileParts.length - 1));

        if (dossierId.isEmpty() || fileId.isEmpty() || fileTypeName.isEmpty() || fileTypeExtension.isEmpty()) {
            throw new IllegalArgumentException(INVALID_STORAGE_ID_FORMAT);
        }

        return new StorageInfo(dossierId, fileId, fileTypeName, fileTypeExtension);
    }


    /**
     * Components of a parsed storage id.
     *
     * @param dossierId         part before the first slash
     * @param fileId            first dot-separated segment after the slash
     * @param fileTypeName      segments between file id and extension, joined with dots
     * @param fileTypeExtension last dot-separated segment, without the leading dot
     */
    public record StorageInfo(String dossierId, String fileId, String fileTypeName, String fileTypeExtension) {

    }

}
|
||||||
@ -32,8 +32,10 @@ dependencies {
|
|||||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.3")
|
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.3")
|
||||||
implementation("org.springframework.boot:spring-boot-starter-websocket:$springBootVersion")
|
implementation("org.springframework.boot:spring-boot-starter-websocket:$springBootVersion")
|
||||||
implementation("org.springframework.security:spring-security-messaging:$springSecurityVersion")
|
implementation("org.springframework.security:spring-security-messaging:$springSecurityVersion")
|
||||||
implementation("com.iqser.red.commons:storage-commons:2.49.0")
|
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||||
implementation("com.knecon.fforesight:keycloak-commons:0.30.0")
|
implementation("com.knecon.fforesight:keycloak-commons:0.30.0") {
|
||||||
|
exclude(group = "com.knecon.fforesight", module = "tenant-commons")
|
||||||
|
}
|
||||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
|
implementation("com.knecon.fforesight:tenant-commons:0.30.0")
|
||||||
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
|
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
|
||||||
implementation("ch.qos.logback:logback-classic")
|
implementation("ch.qos.logback:logback-classic")
|
||||||
|
|||||||
@ -0,0 +1,45 @@
|
|||||||
|
package com.knecon.fforesight.llm.service;
|
||||||
|
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.llm.service.utils.StorageIdUtils;
|
||||||
|
|
||||||
|
public class StorageIdUtilsTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testParseStorageId_ValidInput() {
|
||||||
|
StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId("dossierId/fileId.DOCUMENT_STRUCTURE.json");
|
||||||
|
assertEquals("dossierId", storageInfo.dossierId(), "Incorrect dossierId");
|
||||||
|
assertEquals("fileId", storageInfo.fileId(), "Incorrect fileId");
|
||||||
|
assertEquals("DOCUMENT_STRUCTURE", storageInfo.fileTypeName(), "Incorrect fileTypeName");
|
||||||
|
assertEquals("json", storageInfo.fileTypeExtension(), "Incorrect fileTypeExtension");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testParseStorageId_MissingFileTypeExtension() {
|
||||||
|
Exception exception = assertThrows(IllegalArgumentException.class, () ->
|
||||||
|
StorageIdUtils.parseStorageId("dossierId/fileId.DOCUMENT_STRUCTURE")
|
||||||
|
);
|
||||||
|
assertEquals("Invalid storageId format", exception.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testParseStorageId_InvalidFormat() {
|
||||||
|
Exception exception = assertThrows(IllegalArgumentException.class, () ->
|
||||||
|
StorageIdUtils.parseStorageId("invalidFormat")
|
||||||
|
);
|
||||||
|
assertEquals("Invalid storageId format", exception.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testParseStorageId_NoDotsInFilePart() {
|
||||||
|
Exception exception = assertThrows(IllegalArgumentException.class, () ->
|
||||||
|
StorageIdUtils.parseStorageId("dossierId/fileId")
|
||||||
|
);
|
||||||
|
assertEquals("Invalid storageId format", exception.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user