From fab4666dd7e90588252c723d89ec77f746a2fd0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Thu, 14 Nov 2024 16:29:25 +0100 Subject: [PATCH] RED-9319: move document to its own module --- .../llm-service-processor/build.gradle.kts | 4 +- .../ConsecutiveBoundaryCollector.java | 70 -- .../ConsecutiveTextBlockCollector.java | 78 -- .../llm/service/document/DocumentData.java | 36 - .../service/document/DocumentGraphMapper.java | 225 ------ .../llm/service/document/DocumentTree.java | 387 ---------- .../service/document/PropertiesMapper.java | 72 -- .../document/RectangleTransformations.java | 175 ----- .../llm/service/document/TextRange.java | 250 ------- .../service/document/entity/EntityType.java | 10 - .../llm/service/document/entity/IEntity.java | 30 - .../service/document/entity/IdBuilder.java | 46 -- .../document/entity/PositionOnPage.java | 25 - .../service/document/entity/TextEntity.java | 248 ------- .../document/nodes/AbstractSemanticNode.java | 73 -- .../llm/service/document/nodes/Document.java | 171 ----- .../document/nodes/DuplicatedParagraph.java | 35 - .../llm/service/document/nodes/Footer.java | 62 -- .../document/nodes/GenericSemanticNode.java | 5 - .../llm/service/document/nodes/Header.java | 65 -- .../llm/service/document/nodes/Headline.java | 100 --- .../llm/service/document/nodes/Image.java | 140 ---- .../llm/service/document/nodes/ImageType.java | 25 - .../llm/service/document/nodes/NodeType.java | 22 - .../llm/service/document/nodes/Page.java | 94 --- .../llm/service/document/nodes/Paragraph.java | 54 -- .../llm/service/document/nodes/Section.java | 90 --- .../document/nodes/SectionIdentifier.java | 158 ---- .../service/document/nodes/SemanticNode.java | 684 ------------------ .../service/document/nodes/SuperSection.java | 89 --- .../llm/service/document/nodes/Table.java | 306 -------- .../llm/service/document/nodes/TableCell.java | 84 --- .../document/textblock/AtomicTextBlock.java | 257 ------- 
.../textblock/ConcatenatedTextBlock.java | 268 ------- .../service/document/textblock/TextBlock.java | 176 ----- .../textblock/TextBlockCollector.java | 49 -- .../fforesight/llm/service/models/Chunk.java | 11 +- .../services/DocumentBuilderService.java | 267 +------ .../llm/service/services/LlmNerService.java | 20 +- .../llm-service-server/build.gradle.kts | 2 +- .../TenantExchangeMessageReceiverImpl.java | 67 -- .../TenantMessagingConfigurationImpl.java | 11 - .../queue/TenantQueueProviderConfig.java | 28 + .../websocket/WebSocketMessagingService.java | 1 - .../llm/service/LlmNerServiceTest.java | 2 +- .../src/test/resources/application.yaml | 2 +- .../tmp/AAA_LLM_ENTITIES/entities.json | 1 - 47 files changed, 69 insertions(+), 5006 deletions(-) delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveBoundaryCollector.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveTextBlockCollector.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentTree.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/RectangleTransformations.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/TextRange.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/EntityType.java delete mode 100644 
llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IEntity.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IdBuilder.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/PositionOnPage.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/TextEntity.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Document.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/DuplicatedParagraph.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Footer.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/GenericSemanticNode.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Header.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Headline.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Image.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/ImageType.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/NodeType.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java delete mode 100644 
llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Paragraph.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Section.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SectionIdentifier.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SuperSection.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/TableCell.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/ConcatenatedTextBlock.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlock.java delete mode 100644 llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlockCollector.java delete mode 100644 llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantExchangeMessageReceiverImpl.java delete mode 100644 llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantMessagingConfigurationImpl.java create mode 100644 llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantQueueProviderConfig.java delete mode 100644 llm-service/llm-service-server/tmp/AAA_LLM_ENTITIES/entities.json diff --git 
a/llm-service/llm-service-processor/build.gradle.kts b/llm-service/llm-service-processor/build.gradle.kts index 39cc756..9da3fa6 100644 --- a/llm-service/llm-service-processor/build.gradle.kts +++ b/llm-service/llm-service-processor/build.gradle.kts @@ -13,10 +13,10 @@ extra["testcontainersVersion"] = "1.20.0" dependencies { implementation(project(":llm-service-api")) - implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.181.0") + implementation("com.knecon.fforesight:document:4.425.0-RED9139.13-RED9139.0-RED9139.0") implementation("com.iqser.red.commons:storage-commons:2.50.0") implementation("org.springframework.boot:spring-boot-starter:3.1.1") - implementation("com.knecon.fforesight:tenant-commons:0.30.0") { + implementation("com.knecon.fforesight:tenant-commons:0.31.0") { exclude(group = "com.iqser.red.commons", module = "storage-commons") } implementation("com.azure:azure-ai-openai:1.0.0-beta.10") diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveBoundaryCollector.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveBoundaryCollector.java deleted file mode 100644 index 34c1de6..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveBoundaryCollector.java +++ /dev/null @@ -1,70 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; - -import com.google.common.base.Functions; - -public class ConsecutiveBoundaryCollector implements Collector, List> { - - @Override - public Supplier> supplier() { - - return LinkedList::new; - } - - - @Override - public BiConsumer, TextRange> accumulator() { - 
- return (existingList, boundary) -> { - if (existingList.isEmpty()) { - existingList.add(boundary); - return; - } - - TextRange prevTextRange = existingList.get(existingList.size() - 1); - if (prevTextRange.end() > boundary.start()) { - throw new IllegalArgumentException(String.format("Can't concatenate %s and %s. Boundaries must be ordered!", prevTextRange, boundary)); - } - - if (prevTextRange.end() == boundary.start()) { - existingList.remove(existingList.size() - 1); - existingList.add(TextRange.merge(List.of(prevTextRange, boundary))); - } else { - existingList.add(boundary); - } - }; - } - - - @Override - public BinaryOperator> combiner() { - - return (list1, list2) -> { - list1.addAll(list2); - return list1; - }; - } - - - @Override - public Function, List> finisher() { - - return Functions.identity(); - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.IDENTITY_FINISH); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveTextBlockCollector.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveTextBlockCollector.java deleted file mode 100644 index 9d44070..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/ConsecutiveTextBlockCollector.java +++ /dev/null @@ -1,78 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.textblock.ConcatenatedTextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.NoArgsConstructor; - -@NoArgsConstructor -public 
class ConsecutiveTextBlockCollector implements Collector, List> { - - @Override - public Supplier> supplier() { - - return LinkedList::new; - } - - - @Override - public BiConsumer, TextBlock> accumulator() { - - return (existingList, textBlock) -> { - if (textBlock.isEmpty()) { - return; - } - if (existingList.isEmpty()) { - ConcatenatedTextBlock ctb = ConcatenatedTextBlock.empty(); - ctb.concat(textBlock); - existingList.add(ctb); - return; - } - - ConcatenatedTextBlock prevBlock = existingList.get(existingList.size() - 1); - - if (prevBlock.getTextRange().end() == textBlock.getTextRange().start()) { - prevBlock.concat(textBlock); - } else { - ConcatenatedTextBlock ctb = ConcatenatedTextBlock.empty(); - ctb.concat(textBlock); - existingList.add(ctb); - } - }; - } - - - @Override - public BinaryOperator> combiner() { - - return (list1, list2) -> Stream.concat(list1.stream(), list2.stream()) - .toList(); - } - - - @Override - public Function, List> finisher() { - - return a -> a.stream() - .map(tb -> (TextBlock) tb) - .toList(); - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.IDENTITY_FINISH); - } - -} \ No newline at end of file diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java deleted file mode 100644 index 744836c..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentData.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import java.io.Serializable; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto; -import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class DocumentData implements Serializable { - - DocumentPageProto.AllDocumentPages documentPages; - DocumentTextDataProto.AllDocumentTextData documentTextData; - DocumentPositionDataProto.AllDocumentPositionData documentPositionData; - DocumentStructureWrapper documentStructureWrapper; - - - public DocumentStructureProto.DocumentStructure getDocumentStructure() { - - return documentStructureWrapper.getDocumentStructure(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java deleted file mode 100644 index 356ba1a..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentGraphMapper.java +++ /dev/null @@ -1,225 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto.DocumentPage; -import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto.AllDocumentPositionData; -import static com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto.AllDocumentTextData; -import static 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto.EntryData; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import com.knecon.fforesight.llm.service.document.nodes.Document; -import com.knecon.fforesight.llm.service.document.nodes.DuplicatedParagraph; -import com.knecon.fforesight.llm.service.document.nodes.Footer; -import com.knecon.fforesight.llm.service.document.nodes.Header; -import com.knecon.fforesight.llm.service.document.nodes.Headline; -import com.knecon.fforesight.llm.service.document.nodes.Image; -import com.knecon.fforesight.llm.service.document.nodes.Page; -import com.knecon.fforesight.llm.service.document.nodes.Paragraph; -import com.knecon.fforesight.llm.service.document.nodes.Section; -import com.knecon.fforesight.llm.service.document.nodes.SemanticNode; -import com.knecon.fforesight.llm.service.document.nodes.SuperSection; -import com.knecon.fforesight.llm.service.document.nodes.Table; -import com.knecon.fforesight.llm.service.document.nodes.TableCell; -import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class DocumentGraphMapper { - - public Document toDocumentGraph(DocumentData documentData) { - - Document document = new Document(); - DocumentTree documentTree = new DocumentTree(document); - Context context = new Context(documentData, documentTree); - - context.pageData.addAll(documentData.getDocumentPages().getDocumentPagesList() - .stream() - .map(DocumentGraphMapper::buildPage) - .toList()); - - context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildrenList(), context)); - - 
document.setDocumentTree(context.documentTree); - document.setPages(new HashSet<>(context.pageData)); - document.setNumberOfPages(documentData.getDocumentPages().getDocumentPagesCount()); - - document.setTextBlock(document.getTextBlock()); - return document; - } - - - private List buildEntries(List entries, Context context) { - - List newEntries = new ArrayList<>(entries.size()); - for (EntryData entryData : entries) { - - List pages = entryData.getPageNumbersList() - .stream() - .map(context::getPage) - .toList(); - - SemanticNode node = switch (entryData.getType()) { - case SECTION -> buildSection(context); - case SUPER_SECTION -> buildSuperSection(context); - case PARAGRAPH -> buildParagraph(context, entryData.getProperties()); - case HEADLINE -> buildHeadline(context); - case HEADER -> buildHeader(context); - case FOOTER -> buildFooter(context); - case TABLE -> buildTable(context, entryData.getProperties()); - case TABLE_CELL -> buildTableCell(context, entryData.getProperties()); - case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbersList()); - default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType()); - }; - - if (entryData.getAtomicBlockIdsCount() > 0) { - TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIdsList(), context, node); - node.setLeafTextBlock(textBlock); - - switch (entryData.getType()) { - case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); - case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); - case IMAGE -> pages.forEach(page -> page.getImages().add((Image) node)); - default -> textBlock.getAtomicTextBlocks() - .forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb)); - } - - } - List treeId = entryData.getTreeIdList(); - entryData.getEnginesList() - .forEach(node::addEngine); - node.setTreeId(treeId); - - newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildrenList(), 
context)).node(node).build()); - } return newEntries; - } - - - private Headline buildHeadline(Context context) { - - return Headline.builder().documentTree(context.documentTree).build(); - } - - - private Image buildImage(Context context, Map properties, List pageNumbers) { - - assert pageNumbers.size() == 1; - Page page = context.getPage(pageNumbers.get(0)); - var builder = Image.builder(); - PropertiesMapper.parseImageProperties(properties, builder); - return builder.documentTree(context.documentTree).page(page).build(); - } - - - private TableCell buildTableCell(Context context, Map properties) { - - TableCell.TableCellBuilder builder = TableCell.builder(); - PropertiesMapper.parseTableCellProperties(properties, builder); - return builder.documentTree(context.documentTree).build(); - } - - - private Table buildTable(Context context, Map properties) { - - Table.TableBuilder builder = Table.builder(); - PropertiesMapper.parseTableProperties(properties, builder); - return builder.documentTree(context.documentTree).build(); - } - - - private Footer buildFooter(Context context) { - - return Footer.builder().documentTree(context.documentTree).build(); - } - - - private Header buildHeader(Context context) { - - return Header.builder().documentTree(context.documentTree).build(); - } - - - private Section buildSection(Context context) { - - return Section.builder().documentTree(context.documentTree).build(); - } - - - private SuperSection buildSuperSection(Context context) { - - return SuperSection.builder().documentTree(context.documentTree).build(); - } - - - private Paragraph buildParagraph(Context context, Map properties) { - - if (PropertiesMapper.isDuplicateParagraph(properties)) { - - DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build(); - - var unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties); - duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, 
context, duplicatedParagraph)); - return duplicatedParagraph; - } - - return Paragraph.builder().documentTree(context.documentTree).build(); - } - - - private TextBlock toTextBlock(List atomicTextBlockIds, Context context, SemanticNode parent) { - - return atomicTextBlockIds.stream() - .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)) - .collect(new TextBlockCollector()); - } - - - private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { - - return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)), - context.documentPositionData.getDocumentPositionData(Math.toIntExact(atomicTextBlockId)), - parent, - context.getPage(context.documentTextData.getDocumentTextData(Math.toIntExact(atomicTextBlockId)).getPage())); - } - - - private Page buildPage(DocumentPage p) { - - return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build(); - } - - - static final class Context { - - private final DocumentTree documentTree; - private final List pageData; - private final AllDocumentTextData documentTextData; - private final AllDocumentPositionData documentPositionData; - - - Context(DocumentData documentData, DocumentTree documentTree) { - - this.documentTree = documentTree; - this.pageData = new ArrayList<>(); - this.documentTextData = documentData.getDocumentTextData(); - this.documentPositionData = documentData.getDocumentPositionData(); - - } - - - private Page getPage(Long pageIndex) { - - Page page = pageData.get(Math.toIntExact(pageIndex) - 1); - assert page.getNumber() == Math.toIntExact(pageIndex); - return page; - } - - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentTree.java 
b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentTree.java deleted file mode 100644 index 0a26415..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/DocumentTree.java +++ /dev/null @@ -1,387 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import static java.lang.String.format; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.nodes.Document; -import com.knecon.fforesight.llm.service.document.nodes.GenericSemanticNode; -import com.knecon.fforesight.llm.service.document.nodes.NodeType; -import com.knecon.fforesight.llm.service.document.nodes.SemanticNode; -import com.knecon.fforesight.llm.service.document.nodes.Table; -import com.knecon.fforesight.llm.service.document.nodes.TableCell; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.experimental.FieldDefaults; - -@Data -@EqualsAndHashCode -public class DocumentTree { - - private final Entry root; - - - public DocumentTree(Document document) { - - root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build(); - } - - - public TextBlock buildTextBlock() { - - return allEntriesInOrder().map(Entry::getNode) - .filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); - } - - - public List createNewMainEntryAndReturnId(GenericSemanticNode node) { - - return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node); - } - - - public List 
createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) { - - return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node); - } - - - public List createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) { - - return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node); - } - - - public List createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) { - - return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell); - } - - - @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong - private List createNewChildEntryAndReturnIdImpl(List parentId, SemanticNode node) { - - if (!entryExists(parentId)) { - throw new IllegalArgumentException(format("parentId %s does not exist!", parentId)); - } - - Entry parent = getEntryById(parentId); - List newId = new LinkedList<>(parentId); - newId.add(parent.children.size()); - parent.children.add(Entry.builder().treeId(newId).node(node).build()); - - return newId; - } - - - private boolean entryExists(List treeId) { - - if (treeId.isEmpty()) { - return root != null; - } - Entry entry = root; - for (int id : treeId) { - if (id >= entry.children.size() || 0 > id) { - return false; - } - entry = entry.children.get(id); - } - return true; - } - - - public Entry getParentEntryById(List treeId) { - - return getEntryById(getParentId(treeId)); - } - - - public boolean hasParentById(List treeId) { - - return !treeId.isEmpty(); - } - - - public Stream childNodes(List treeId) { - - return getEntryById(treeId).children.stream() - .map(Entry::getNode); - } - - - /** - * Finds all child nodes of the specified entry, whose nodes textRange intersects the given textRange. It achieves this by finding the first entry, whose textRange contains the start idx of the TextRange using a binary search. 
- * It then iterates over the remaining children adding them to the intersections, until one does not contain the end of the TextRange. All intersected Entries are returned as SemanticNodes. - * - * @param treeId the treeId of the Entry whose children shall be checked. - * @param textRange The TextRange to find intersecting childNodes for. - * @return A list of all SemanticNodes, that are direct children of the specified Entry, whose TextRange intersects the given TextRange - */ - public List findIntersectingChildNodes(List treeId, TextRange textRange) { - - List childEntries = getEntryById(treeId).getChildren(); - List intersectingChildEntries = new LinkedList<>(); - int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start()); - if (startIdx < 0) { - return intersectingChildEntries; - } - for (int i = startIdx; i < childEntries.size(); i++) { - if (childEntries.get(i).getNode().getTextRange().start() < textRange.end()) { - intersectingChildEntries.add(childEntries.get(i).getNode()); - } else { - break; - } - } - return intersectingChildEntries; - } - - - public Optional findFirstContainingChild(List treeId, TextRange textRange) { - - List childEntries = getEntryById(treeId).getChildren(); - int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start()); - if (startIdx < 0) { - return Optional.empty(); - } - - if (childEntries.get(startIdx).getNode().getTextRange().contains(textRange.end())) { - return Optional.of(childEntries.get(startIdx).getNode()); - } - - return Optional.empty(); - } - - - public Optional findTableCellInTable(List treeId, int start, int end) { - - return findTableCellInTableRecursively(getEntryById(treeId).getChildren(), start, end); - } - - - private Optional findTableCellInTableRecursively(List entries, int start, int end) { - - int startIdx = findFirstIdxOfContainingChildBinarySearch(entries, start); - if (startIdx < 0) { - return Optional.empty(); - } - - Entry entry = 
entries.get(startIdx); - - if (entry.getNode().getTextRange().contains(end) && entry.getNode() instanceof TableCell tableCell) { - if (!entry.getNode().isLeaf()) { - Optional foundInChildren = findTableCellInTableRecursively(entry.getChildren(), start, end); - if (foundInChildren.isPresent()) { - return foundInChildren; - } - } - return Optional.of(tableCell); - } - - if (!entry.getNode().isLeaf()) { - Optional foundInChildren = findTableCellInTableRecursively(entry.getChildren(), start, end); - if (foundInChildren.isPresent()) { - return foundInChildren; - } - } - - return Optional.empty(); - } - - - private int findFirstIdxOfContainingChildBinarySearch(List childNodes, int start) { - - int low = 0; - int high = childNodes.size() - 1; - while (low <= high) { - int mid = low + (high - low) / 2; - TextRange range = childNodes.get(mid).getNode().getTextRange(); - if (range.start() > start) { - high = mid - 1; - } else if (range.end() <= start) { - low = mid + 1; - } else { - return mid; - } - } - return -1; - } - - - public Stream childNodesOfType(List treeId, NodeType nodeType) { - - return getEntryById(treeId).children.stream() - .filter(entry -> entry.node.getType().equals(nodeType)) - .map(Entry::getNode); - } - - - private static List getParentId(List treeId) { - - if (treeId.isEmpty()) { - throw new UnsupportedOperationException("Root has no parent!"); - } - if (treeId.size() < 2) { - return Collections.emptyList(); - } - return treeId.subList(0, treeId.size() - 1); - } - - - public Optional getNextSibling(List treeId) { - - var siblingTreeId = getNextSiblingId(treeId); - if (!entryExists(siblingTreeId)) { - return Optional.empty(); - } - return Optional.of(getEntryById(siblingTreeId).getNode()); - } - - - public List getNextSiblingId(List treeId) { - - List siblingTreeId = new LinkedList<>(); - for (int i = 0; i < treeId.size() - 1; i++) { - siblingTreeId.add(treeId.get(i)); - } - siblingTreeId.add(treeId.get(treeId.size() - 1) + 1); - return siblingTreeId; - 
} - - - public Optional getPreviousSibling(List treeId) { - - var siblingTreeId = getPreviousSiblingId(treeId); - if (!entryExists(siblingTreeId)) { - return Optional.empty(); - } - return Optional.of(getEntryById(siblingTreeId).getNode()); - } - - - public List getPreviousSiblingId(List treeId) { - - List siblingTreeId = new LinkedList<>(); - for (int i = 0; i < treeId.size() - 1; i++) { - siblingTreeId.add(treeId.get(i)); - } - siblingTreeId.add(treeId.get(treeId.size() - 1) - 1); - return siblingTreeId; - } - - - public Entry getEntryById(List treeId) { - - if (treeId.isEmpty()) { - return root; - } - Entry entry = root; - for (int id : treeId) { - entry = entry.children.get(id); - } - return entry; - } - - - public Optional findEntryById(List treeId) { - - if (treeId.isEmpty()) { - return Optional.of(root); - } - Entry entry = root; - for (int id : treeId) { - if (entry.children.size() <= id) { - return Optional.empty(); - } - entry = entry.children.get(id); - } - return Optional.of(entry); - } - - - public Stream mainEntries() { - - return root.children.stream(); - } - - - public Stream allEntriesInOrder() { - - return Stream.of(root) - .flatMap(DocumentTree::flatten); - } - - - public Stream allSubEntriesInOrder(List parentId) { - - return getEntryById(parentId).children.stream() - .flatMap(DocumentTree::flatten); - } - - - @Override - public String toString() { - - return String.join("\n", - allEntriesInOrder().map(Entry::toString) - .toList()); - } - - - private static Stream flatten(Entry entry) { - - return Stream.concat(Stream.of(entry), - entry.children.stream() - .flatMap(DocumentTree::flatten)); - } - - - public SemanticNode getHighestParentById(List treeId) { - - if (treeId.isEmpty()) { - return root.node; - } - return root.children.get(treeId.get(0)).node; - } - - - @Builder - @Getter - @AllArgsConstructor - @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) - public static class Entry { - - List treeId; - SemanticNode node; - 
@Builder.Default - List children = new ArrayList<>(); - - - @Override - public String toString() { - - return node.toString(); - } - - - public NodeType getType() { - - return node.getType(); - } - - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java deleted file mode 100644 index 60f9d78..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/PropertiesMapper.java +++ /dev/null @@ -1,72 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.List; -import java.util.Map; - -import com.knecon.fforesight.llm.service.document.nodes.Image; -import com.knecon.fforesight.llm.service.document.nodes.ImageType; -import com.knecon.fforesight.llm.service.document.nodes.Table; -import com.knecon.fforesight.llm.service.document.nodes.TableCell; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class PropertiesMapper { - - public void parseImageProperties(Map properties, Image.ImageBuilder builder) { - - builder.imageType(ImageType.fromString(properties.get(DocumentStructureWrapper.ImageProperties.IMAGE_TYPE))); - builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.ImageProperties.TRANSPARENT))); - builder.position(parseRectangle2D(properties.get(DocumentStructureWrapper.ImageProperties.POSITION))); - builder.id(properties.get(DocumentStructureWrapper.ImageProperties.ID)); - } - - - public void parseTableCellProperties(Map properties, TableCell.TableCellBuilder builder) { - - builder.row(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.ROW))); - 
builder.col(Integer.parseInt(properties.get(DocumentStructureWrapper.TableCellProperties.COL))); - builder.header(Boolean.parseBoolean(properties.get(DocumentStructureWrapper.TableCellProperties.HEADER))); - builder.bBox(parseRectangle2D(properties.get(DocumentStructureWrapper.TableCellProperties.B_BOX))); - } - - - public void parseTableProperties(Map properties, Table.TableBuilder builder) { - - builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_ROWS))); - builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructureWrapper.TableProperties.NUMBER_OF_COLS))); - } - - - private Rectangle2D parseRectangle2D(String bBox) { - - List floats = Arrays.stream(bBox.split(DocumentStructureWrapper.RECTANGLE_DELIMITER)) - .map(Float::parseFloat) - .toList(); - return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); - } - - - public static boolean isDuplicateParagraph(Map properties) { - - return properties.containsKey(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID); - } - - - public static List getUnsortedTextblockIds(Map properties) { - - return toLongList(properties.get(DocumentStructureWrapper.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID)); - } - - - public static List toLongList(String ids) { - - return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")) - .map(Long::valueOf) - .toList(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/RectangleTransformations.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/RectangleTransformations.java deleted file mode 100644 index edcc87b..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/RectangleTransformations.java +++ /dev/null @@ -1,175 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import 
java.awt.geom.Rectangle2D; -import java.awt.geom.RectangularShape; -import java.util.Collection; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; - -import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; - -import lombok.AllArgsConstructor; -import lombok.NoArgsConstructor; - -public class RectangleTransformations { - - public static Rectangle2D atomicTextBlockBBox(List atomicTextBlocks) { - - return atomicTextBlocks.stream() - .flatMap(atomicTextBlock -> atomicTextBlock.getPositions() - .stream()) - .collect(new Rectangle2DBBoxCollector()); - } - - - - public static Rectangle2D rectangle2DBBox(Collection rectangle2DList) { - - return rectangle2DList.stream() - .collect(new Rectangle2DBBoxCollector()); - } - - - /** - * If two rectangles are further apart than five times the average width of a rectangle, a gap is inserted. 
- * - * @param rectangle2DList A list of rectangles to combine - * @return A list of rectangles which are combined if they are closer than the split threshold - */ - public static List rectangleBBoxWithGaps(List rectangle2DList) { - - if (rectangle2DList.isEmpty()) { - return Collections.emptyList(); - } - double splitThreshold = rectangle2DList.stream() - .mapToDouble(RectangularShape::getWidth).average() - .orElse(5) * 5.0; - - List> rectangleListsWithGaps = new LinkedList<>(); - List rectangleListWithoutGaps = new LinkedList<>(); - rectangleListsWithGaps.add(rectangleListWithoutGaps); - Rectangle2D previousRectangle = rectangle2DList.get(0); - for (Rectangle2D currentRectangle : rectangle2DList) { - if (Math.abs(currentRectangle.getMinX() - previousRectangle.getMaxX()) > splitThreshold) { - rectangleListWithoutGaps = new LinkedList<>(); - rectangleListWithoutGaps.add(currentRectangle); - rectangleListsWithGaps.add(rectangleListWithoutGaps); - previousRectangle = currentRectangle; - } else { - rectangleListWithoutGaps.add(currentRectangle); - previousRectangle = currentRectangle; - } - } - return rectangleListsWithGaps.stream() - .map(RectangleTransformations::rectangle2DBBox) - .toList(); - } - - - public static Collector collectBBox() { - - return new Rectangle2DBBoxCollector(); - } - - - private static class Rectangle2DBBoxCollector implements Collector { - - @Override - public Supplier supplier() { - - return BBox::new; - } - - - @Override - public BiConsumer accumulator() { - - return BBox::addRectangle; - } - - - @Override - public BinaryOperator combiner() { - - return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX), - Math.min(b1.lowerLeftY, b2.lowerLeftY), - Math.max(b1.upperRightX, b2.upperRightX), - Math.max(b1.upperRightY, b2.upperRightY)); - } - - - @Override - public Function finisher() { - - return BBox::toRectangle2D; - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.UNORDERED); - } - - - 
@AllArgsConstructor - @NoArgsConstructor - private static class BBox { - - Double lowerLeftX; - Double lowerLeftY; - Double upperRightX; - Double upperRightY; - - - public Rectangle2D toRectangle2D() { - - if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) { - return new Rectangle2D.Double(0, 0, 0, 0); - } - return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY); - } - - - public void addRectangle(Rectangle2D rectangle2D) { - - double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX()); - double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()); - double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX()); - double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY()); - - if (this.lowerLeftX == null) { - this.lowerLeftX = lowerLeftX; - } else if (this.lowerLeftX > lowerLeftX) { - this.lowerLeftX = lowerLeftX; - } - if (this.lowerLeftY == null) { - this.lowerLeftY = lowerLeftY; - } else if (this.lowerLeftY > lowerLeftY) { - this.lowerLeftY = lowerLeftY; - } - if (this.upperRightX == null) { - this.upperRightX = upperRightX; - } else if (this.upperRightX < upperRightX) { - this.upperRightX = upperRightX; - } - if (this.upperRightY == null) { - this.upperRightY = upperRightY; - } else if (this.upperRightY < upperRightY) { - this.upperRightY = upperRightY; - } - - } - - } - - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/TextRange.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/TextRange.java deleted file mode 100644 index f82fc9f..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/TextRange.java +++ /dev/null @@ -1,250 +0,0 @@ -package com.knecon.fforesight.llm.service.document; - -import static java.lang.String.format; - -import 
java.util.Collection; -import java.util.LinkedList; -import java.util.List; - -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.EqualsAndHashCode; -import lombok.Setter; - -/** - * Represents a range of text defined by a start and end index. - * Provides functionality to check containment, intersection, and to adjust ranges based on specified conditions. - */ -@Setter -@EqualsAndHashCode -@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName") -public class TextRange implements Comparable { - - private int start; - private int end; - - - /** - * Constructs a TextRange with specified start and end indexes. - * - * @param start The starting index of the range. - * @param end The ending index of the range. - * @throws IllegalArgumentException If start is greater than end. - */ - public TextRange(int start, int end) { - - if (start > end) { - throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); - } - this.start = start; - this.end = end; - } - - - /** - * Returns the length of the text range. - * - * @return The length of the range. - */ - public int length() { - - return end - start; - } - - - public int start() { - - return start; - } - - - public int end() { - - return end; - } - - - /** - * Checks if this {@link TextRange} fully contains another TextRange. - * - * @param textRange The {@link TextRange} to check. - * @return true if this range contains the specified range, false otherwise. - */ - public boolean contains(TextRange textRange) { - - return start <= textRange.start() && textRange.end() <= end; - } - - - /** - * Checks if this {@link TextRange} is fully contained by another TextRange. - * - * @param textRange The {@link TextRange} to check against. - * @return true if this range is contained by the specified range, false otherwise. 
- */ - public boolean containedBy(TextRange textRange) { - - return textRange.contains(this); - } - - - /** - * Checks if this {@link TextRange} contains another range specified by start and end indices. - * - * @param start The starting index of the range to check. - * @param end The ending index of the range to check. - * @return true if this range fully contains the specified range, false otherwise. - * @throws IllegalArgumentException If the start index is greater than the end index. - */ - public boolean contains(int start, int end) { - - if (start > end) { - throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); - } - return this.start <= start && end <= this.end; - } - - - /** - * Checks if this {@link TextRange} is fully contained within another range specified by start and end indices. - * - * @param start The starting index of the outer range. - * @param end The ending index of the outer range. - * @return true if this range is fully contained within the specified range, false otherwise. - * @throws IllegalArgumentException If the start index is greater than the end index. - */ - public boolean containedBy(int start, int end) { - - if (start > end) { - throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); - } - return start <= this.start && this.end <= end; - } - - - /** - * Determines if the specified index is within this {@link TextRange}. - * - * @param index The index to check. - * @return true if the index is within the range (inclusive of the start and exclusive of the end), false otherwise. - */ - public boolean contains(int index) { - - return start <= index && index < end; - } - - - /** - * Checks if this {@link TextRange} intersects with another {@link TextRange}. - * - * @param textRange The {@link TextRange} to check for intersection. - * @return true if the ranges intersect, false otherwise. 
- */ - public boolean intersects(TextRange textRange) { - - return textRange.start() < this.end && this.start < textRange.end(); - } - - - /** - * Splits this TextRange into multiple ranges based on a list of indices. - * - * @param splitIndices The indices at which to split the range. - * @return A list of TextRanges resulting from the split. - * @throws IndexOutOfBoundsException If any split index is outside this TextRange. - */ - public List split(List splitIndices) { - - if (splitIndices.stream() - .anyMatch(idx -> !this.contains(idx))) { - throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", - splitIndices.stream() - .filter(idx -> !this.contains(idx)) - .toList(), - this)); - } - List splitBoundaries = new LinkedList<>(); - int previousIndex = start; - for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) { - int splitIndex = splitIndices.get(i); - - // skip split if it would produce a boundary of length 0 - if (splitIndex == previousIndex) { - continue; - } - splitBoundaries.add(new TextRange(previousIndex, splitIndex)); - previousIndex = splitIndex; - } - splitBoundaries.add(new TextRange(previousIndex, end)); - return splitBoundaries; - } - - - /** - * Merges a collection of TextRanges into a single Text range encompassing all. - * - * @param boundaries The collection of TextRanges to merge. - * @return A new TextRange covering the entire span of the given ranges. - * @throws IllegalArgumentException If boundaries are empty. 
- */ - public static TextRange merge(Collection boundaries) { - - int minStart = boundaries.stream() - .mapToInt(TextRange::start) - .min() - .orElseThrow(IllegalArgumentException::new); - int maxEnd = boundaries.stream() - .mapToInt(TextRange::end) - .max() - .orElseThrow(IllegalArgumentException::new); - return new TextRange(minStart, maxEnd); - } - - - @Override - public String toString() { - - return format("Boundary [%d|%d)", start, end); - } - - - @Override - public int compareTo(TextRange textRange) { - - if (end < textRange.end() && start < textRange.start()) { - return -1; - } - if (start > textRange.start() && end > textRange.end()) { - return 1; - } - - return 0; - } - - - /** - * Shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without trailing or preceding whitespaces. - * - * @param textBlock TextBlock to check whitespaces against - * @return Trimmed boundary - */ - public TextRange trim(TextBlock textBlock) { - - if (this.length() == 0) { - return this; - } - - int trimmedStart = this.start; - while (textBlock.containsIndex(trimmedStart) && trimmedStart < end && Character.isWhitespace(textBlock.charAt(trimmedStart))) { - trimmedStart++; - } - - int trimmedEnd = this.end; - while (textBlock.containsIndex(trimmedEnd - 1) && trimmedStart < trimmedEnd && Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) { - trimmedEnd--; - } - - return new TextRange(trimmedStart, Math.max(trimmedEnd, trimmedStart)); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/EntityType.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/EntityType.java deleted file mode 100644 index 6d0673c..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/EntityType.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.knecon.fforesight.llm.service.document.entity; - -public enum 
EntityType { - ENTITY, - HINT, - RECOMMENDATION, - FALSE_POSITIVE, - FALSE_RECOMMENDATION, - DICTIONARY_REMOVAL -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IEntity.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IEntity.java deleted file mode 100644 index 3c8af0b..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IEntity.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.knecon.fforesight.llm.service.document.entity; - -import com.knecon.fforesight.llm.service.document.TextRange; - -public interface IEntity { - - /** - * Gets the value of this entity as a string. - * - * @return The string value. - */ - String getValue(); - - - /** - * Gets the range of text in the document associated with this entity. - * - * @return The text range. - */ - TextRange getTextRange(); - - - /** - * Gets the type of this entity. - * - * @return The entity type. 
- */ - String type(); - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IdBuilder.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IdBuilder.java deleted file mode 100644 index 15eb4f3..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/IdBuilder.java +++ /dev/null @@ -1,46 +0,0 @@ -package com.knecon.fforesight.llm.service.document.entity; - -import java.awt.geom.Rectangle2D; -import java.nio.charset.StandardCharsets; -import java.util.Comparator; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import com.knecon.fforesight.llm.service.document.nodes.Page; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public final class IdBuilder { - - private final HashFunction hashFunction = Hashing.murmur3_128(); - - - public String buildId(Set pages, List rectanglesPerLine, String type, String entityType) { - - return buildId(pages.stream() - .map(Page::getNumber) - .collect(Collectors.toList()), rectanglesPerLine, type, entityType); - } - - - public String buildId(List pageNumbers, List rectanglesPerLine, String type, String entityType) { - - StringBuilder sb = new StringBuilder(); - sb.append(type).append(entityType); - List sortedPageNumbers = pageNumbers.stream() - .sorted(Comparator.comparingInt(Integer::intValue)) - .toList(); - sortedPageNumbers.forEach(sb::append); - rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX())) - .append(Math.round(rectangle2D.getY())) - .append(Math.round(rectangle2D.getWidth())) - .append(Math.round(rectangle2D.getHeight()))); - - return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); - } - -} diff --git 
a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/PositionOnPage.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/PositionOnPage.java deleted file mode 100644 index 2c82166..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/PositionOnPage.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.knecon.fforesight.llm.service.document.entity; - -import java.awt.geom.Rectangle2D; -import java.util.List; - -import com.knecon.fforesight.llm.service.document.nodes.Page; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.experimental.FieldDefaults; - -@Data -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class PositionOnPage { - - // Each entry in this list corresponds to an entry in the redaction log, this means: - // A single entity might be represented by multiple redaction log entries - // This is due to the RedactionLog only being able to handle a single page per entry. 
- final String id; - Page page; - List rectanglePerLine; - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/TextEntity.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/TextEntity.java deleted file mode 100644 index 72bf9b8..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/entity/TextEntity.java +++ /dev/null @@ -1,248 +0,0 @@ -package com.knecon.fforesight.llm.service.document.entity; - -import java.awt.geom.Rectangle2D; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.llm.service.document.TextRange; -import com.knecon.fforesight.llm.service.document.nodes.Page; -import com.knecon.fforesight.llm.service.document.nodes.SemanticNode; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true) -@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName") -public class TextEntity implements IEntity { - - // primary key - @EqualsAndHashCode.Include - final String id; - // primary key end - - TextRange textRange; - @Builder.Default - List duplicateTextRanges = new ArrayList<>(); - String type; // TODO: make final once ManualChangesApplicationService::recategorize is deleted - final EntityType entityType; - - // inferred on graph insertion - String value; - String textBefore; - String textAfter; - @Builder.Default - Set pages = new HashSet<>(); - List positionsOnPagePerPage; - @Builder.Default - List intersectingNodes = new LinkedList<>(); - 
SemanticNode deepestFullyContainingNode; - - - public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, SemanticNode node) { - - return TextEntity.builder().id(buildId(node, textRange, type, entityType)).type(type).entityType(entityType).textRange(textRange).build(); - } - - - public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id) { - - return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build(); - } - - - public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id, String manualOverwriteSection) { - - return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build(); - } - - - private static String buildId(SemanticNode node, TextRange textRange, String type, EntityType entityType) { - - Map> rectanglesPerLinePerPage = node.getPositionsPerPage(textRange); - return IdBuilder.buildId(rectanglesPerLinePerPage.keySet(), - rectanglesPerLinePerPage.values() - .stream() - .flatMap(Collection::stream) - .toList(), - type, - entityType.name()); - } - - - public void addTextRange(TextRange textRange) { - - duplicateTextRanges.add(textRange); - } - - - public boolean occursInNodeOfType(Class clazz) { - - return intersectingNodes.stream() - .anyMatch(clazz::isInstance); - } - - - public boolean occursInNode(SemanticNode semanticNode) { - - return intersectingNodes.stream() - .anyMatch(node -> node.equals(semanticNode)); - } - - - public boolean isType(String type) { - - return type().equals(type); - } - - - public boolean isAnyType(List types) { - - return types.contains(type()); - } - - - public void addIntersectingNode(SemanticNode containingNode) { - - intersectingNodes.add(containingNode); - } - - - public String getValueWithLineBreaks() { - - return getDeepestFullyContainingNode().getTextBlock().subSequenceWithLineBreaks(getTextRange()); - } - - - public 
void removeFromGraph() { - - intersectingNodes.forEach(node -> node.getEntities().remove(this)); - pages.forEach(page -> page.getEntities().remove(this)); - intersectingNodes = new LinkedList<>(); - deepestFullyContainingNode = null; - pages = new HashSet<>(); - } - - - public List getPositionsOnPagePerPage() { - - if (positionsOnPagePerPage == null || positionsOnPagePerPage.isEmpty()) { - Map> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange); - - Page firstPage = rectanglesPerLinePerPage.keySet() - .stream() - .min(Comparator.comparingInt(Page::getNumber)) - .orElseThrow(() -> new RuntimeException("No Positions found on any page!")); - - positionsOnPagePerPage = rectanglesPerLinePerPage.entrySet() - .stream() - .map(entry -> buildPositionOnPage(firstPage, id, entry)) - .toList(); - } - return positionsOnPagePerPage; - } - - - private static PositionOnPage buildPositionOnPage(Page firstPage, String id, Map.Entry> entry) { - - if (entry.getKey().equals(firstPage)) { - return new PositionOnPage(id, entry.getKey(), entry.getValue()); - } else { - return new PositionOnPage(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue()); - } - } - - - public boolean containedBy(TextEntity textEntity) { - - return textEntity.contains(this); - } - - - public boolean contains(TextEntity textEntity) { - - if (this.textRange.contains(textEntity.getTextRange())) { - return true; - } - - List textEntityDuplicateRanges = textEntity.getDuplicateTextRanges(); - // use optimized indexed loops for extra performance boost - for (int i = 0, duplicateTextRangesSize = duplicateTextRanges.size(); i < duplicateTextRangesSize; i++) { - TextRange duplicateTextRange = duplicateTextRanges.get(i); - if (duplicateTextRange.contains(textEntity.getTextRange())) { - return true; - } - for (int j = 0, textEntityDuplicateRangesSize = textEntityDuplicateRanges.size(); j < textEntityDuplicateRangesSize; j++) { - TextRange otherRange = 
textEntityDuplicateRanges.get(j); - if (duplicateTextRange.contains(otherRange)) { - return true; - } - } - } - - return false; - } - - - public boolean intersects(TextEntity textEntity) { - - return this.textRange.intersects(textEntity.getTextRange()) // - || duplicateTextRanges.stream() - .anyMatch(duplicateTextRange -> duplicateTextRange.intersects(textEntity.textRange)) // - || duplicateTextRanges.stream() - .anyMatch(duplicateTextRange -> textEntity.getDuplicateTextRanges() - .stream() - .anyMatch(duplicateTextRange::intersects)); - } - - - public boolean matchesAnnotationId(String manualRedactionId) { - - return getPositionsOnPagePerPage().stream() - .anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId)); - } - - - @Override - public String toString() { - - StringBuilder sb = new StringBuilder(); - sb.append("Entity[\""); - sb.append(value); - sb.append("\", "); - sb.append(textRange); - sb.append(", pages["); - pages.forEach(page -> { - sb.append(page.getNumber()); - sb.append(", "); - }); - sb.delete(sb.length() - 2, sb.length()); - sb.append("], type = \""); - sb.append(type()); - sb.append("\", EntityType."); - sb.append(entityType); - sb.append("]"); - return sb.toString(); - } - - - @Override - public String type() { - - return getType(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java deleted file mode 100644 index df2737e..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/AbstractSemanticNode.java +++ /dev/null @@ -1,73 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.awt.geom.Rectangle2D; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import 
com.knecon.fforesight.llm.service.document.DocumentTree; -import com.knecon.fforesight.llm.service.document.entity.TextEntity; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Data -@SuperBuilder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true) -public abstract class AbstractSemanticNode implements GenericSemanticNode { - - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM)); - @EqualsAndHashCode.Include - List treeId; - - TextBlock textBlock; - DocumentTree documentTree; - - @Builder.Default - Set entities = new HashSet<>(); - - Map bBoxCache; - - - @Override - public TextBlock getTextBlock() { - - if (textBlock == null) { - textBlock = GenericSemanticNode.super.getTextBlock(); - } - return textBlock; - } - - - @Override - public String toString() { - - return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary(); - } - - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = GenericSemanticNode.super.getBBox(); - } - return bBoxCache; - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Document.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Document.java deleted file mode 100644 index 9db7e3a..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Document.java +++ /dev/null @@ 
-1,171 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.awt.geom.Rectangle2D; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.DocumentTree; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -/** - * Represents the entire document as a node within the document's semantic structure. - */ -@Data -@SuperBuilder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Document extends AbstractSemanticNode { - - Set pages; - Integer numberOfPages; - - @Builder.Default - static final SectionIdentifier sectionIdentifier = SectionIdentifier.document(); - - - @Override - public NodeType getType() { - - return NodeType.DOCUMENT; - } - - - /** - * Gets the sections of the document as a list. - * - * @return A list of all sections within the document. - */ - public List
getAllSections() { - - return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node) - .collect(Collectors.toList()); - } - - - /** - * Gets the main sections of the document as a list. - * - * @return A list of main sections within the document - * @deprecated This method is marked for removal. - * Use {@link #streamChildrenOfType(NodeType)} instead, - * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. - */ - @Deprecated(forRemoval = true) - public List
getMainSections() { - - return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node) - .collect(Collectors.toList()); - } - - - /** - * Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects. - * - * @return A list of all children of type SECTION or SUPER_SECTION. - */ - public List getChildrenOfTypeSectionOrSuperSection() { - - return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) - .toList(); - } - - - /** - * Streams all terminal (leaf) text blocks within the document in their natural order. - * - * @return A stream of terminal {@link TextBlock}. - */ - public Stream streamTerminalTextBlocksInOrder() { - - return streamAllNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getTextBlock); - } - - - @Override - public List getTreeId() { - - return Collections.emptyList(); - } - - - @Override - public void setTreeId(List tocId) { - - throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents"); - } - - - @Override - public SectionIdentifier getSectionIdentifier() { - - return sectionIdentifier; - } - - - @Override - public Headline getHeadline() { - - return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node) - .findFirst() - .orElseGet(Headline::empty); - } - - - /** - * Streams all nodes within the document, regardless of type, in their natural order. - * - * @return A stream of all {@link SemanticNode} within the document. - */ - private Stream streamAllNodes() { - - return getDocumentTree().allEntriesInOrder() - .map(DocumentTree.Entry::getNode); - } - - - /** - * Streams all image nodes contained within the document. - * - * @return A stream of {@link Image} nodes. 
- */ - public Stream streamAllImages() { - - return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node); - } - - - @Override - public String toString() { - - return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary(); - } - - - @Override - public Map getBBox() { - - Map bBox = new HashMap<>(); - for (Page page : pages) { - bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight())); - } - return bBox; - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/DuplicatedParagraph.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/DuplicatedParagraph.java deleted file mode 100644 index 634a653..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/DuplicatedParagraph.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; - -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.SuperBuilder; - -@Data -@EqualsAndHashCode(callSuper = true) -@SuperBuilder -public class DuplicatedParagraph extends Paragraph { - - TextBlock unsortedLeafTextBlock; - - - @Override - public TextBlock getTextBlock() { - - return Stream.of(leafTextBlock, unsortedLeafTextBlock) - .collect(new TextBlockCollector()); - - } - - - @Override - public String toString() { - - return super.toString(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Footer.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Footer.java deleted file mode 100644 index 819c1e5..0000000 --- 
a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Footer.java +++ /dev/null @@ -1,62 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - - - -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -@Data -@SuperBuilder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Footer extends AbstractSemanticNode { - - final static SectionIdentifier sectionIdentifier = SectionIdentifier.empty(); - - TextBlock leafTextBlock; - - - @Override - public NodeType getType() { - - return NodeType.FOOTER; - } - - - @Override - public boolean isLeaf() { - - return true; - } - - - @Override - public TextBlock getTextBlock() { - - return leafTextBlock; - } - - - @Override - public SectionIdentifier getSectionIdentifier() { - - return sectionIdentifier; - } - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/GenericSemanticNode.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/GenericSemanticNode.java deleted file mode 100644 index b35647a..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/GenericSemanticNode.java +++ /dev/null @@ -1,5 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -public interface GenericSemanticNode extends SemanticNode { - -} diff --git 
a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Header.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Header.java deleted file mode 100644 index f3cd0d1..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Header.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - - - -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -/** - * Represents the header part of a document page. - */ -@Data -@SuperBuilder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Header extends AbstractSemanticNode { - - final static SectionIdentifier sectionIdentifier = SectionIdentifier.empty(); - - TextBlock leafTextBlock; - - - @Override - public boolean isLeaf() { - - return true; - } - - - @Override - public NodeType getType() { - - return NodeType.HEADER; - } - - - @Override - public TextBlock getTextBlock() { - - return leafTextBlock; - } - - - @Override - public SectionIdentifier getSectionIdentifier() { - - return sectionIdentifier; - } - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Headline.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Headline.java deleted file mode 100644 index 854df99..0000000 --- 
a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Headline.java +++ /dev/null @@ -1,100 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - - - - -import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -/** - * Represents a headline in a document. - */ -@Data -@SuperBuilder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Headline extends AbstractSemanticNode { - - TextBlock leafTextBlock; - SectionIdentifier sectionIdentifier; - - - @Override - public NodeType getType() { - - return NodeType.HEADLINE; - } - - - @Override - public boolean isLeaf() { - - return true; - } - - - @Override - public TextBlock getTextBlock() { - - return leafTextBlock; - } - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary(); - } - - - @Override - public Headline getHeadline() { - - return this; - } - - - @Override - public SectionIdentifier getSectionIdentifier() { - - if (sectionIdentifier == null) { - sectionIdentifier = SectionIdentifier.fromSearchText(getTextBlock().getSearchText()); - } - return sectionIdentifier; - } - - - /** - * Creates an empty headline with no text content. - * - * @return An empty {@link Headline} instance. - */ - public static Headline empty() { - - return Headline.builder().leafTextBlock(AtomicTextBlock.empty(-1L, 0, new Page(), -1, null)).build(); - } - - - /** - * Checks if this headline is associated with any paragraphs within its parent section or node. 
- * - * @return True if there are paragraphs associated with this headline, false otherwise. - */ - public boolean hasParagraphs() { - - return getParent().streamAllSubNodesOfType(NodeType.PARAGRAPH) - .findFirst() - .isPresent(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Image.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Image.java deleted file mode 100644 index 30c24f5..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Image.java +++ /dev/null @@ -1,140 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.awt.geom.Rectangle2D; -import java.util.Collections; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.llm.service.document.TextRange; -import com.knecon.fforesight.llm.service.document.entity.IEntity; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -/** - * Represents an image within the document. 
- */ -@Data -@SuperBuilder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Image extends AbstractSemanticNode implements IEntity { - - String id; - - TextBlock leafTextBlock; - - ImageType imageType; - boolean transparent; - Rectangle2D position; - - Page page; - - - @Override - public NodeType getType() { - - return NodeType.IMAGE; - } - - - @Override - public TextBlock getTextBlock() { - - return leafTextBlock; - } - - - @Override - public Set getPages() { - - return Collections.singleton(page); - } - - - @Override - public TextRange getTextRange() { - - return leafTextBlock.getTextRange(); - } - - - @Override - public int length() { - - return getTextRange().length(); - } - - - @Override - public String type() { - - return getType().toString().toLowerCase(Locale.ENGLISH); - } - - - @Override - public String toString() { - - return getTreeId() + ": " + getValue() + " " + position; - } - - - @Override - public Map getBBox() { - - Map bBoxPerPage = new HashMap<>(); - bBoxPerPage.put(page, position); - return bBoxPerPage; - } - - - @Override - public String getValue() { - - return NodeType.IMAGE + ":" + camelCase(imageType.toString()); - } - - - private String camelCase(String name) { - - return name.charAt(0) + name.substring(1).toLowerCase(Locale.ENGLISH); - } - - - public boolean mostlyContainedBy(Image image, double containmentThreshold) { - - Map bboxImage = image.getBBox(); - Map bbox = this.getBBox(); - //image needs to be on the same page - if (bboxImage.get(this.page) != null) { - Rectangle2D intersection = bboxImage.get(this.page).createIntersection(bbox.get(this.page)); - double calculatedIntersection = intersection.getWidth() * intersection.getHeight(); - double area = bbox.get(this.page).getWidth() * bbox.get(this.page).getHeight(); - return (calculatedIntersection / area) > containmentThreshold; - } - return false; - } - - - public 
boolean mostlyContains(Image image, double containmentThreshold) { - - Map bboxImage = image.getBBox(); - Map bbox = this.getBBox(); - Rectangle2D intersection = bbox.get(this.page).createIntersection(bboxImage.get(this.page)); - double calculatedIntersection = intersection.getWidth() * intersection.getHeight(); - double area = bbox.get(this.page).getWidth() * bbox.get(this.page).getHeight(); - return (area / calculatedIntersection) > containmentThreshold; - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/ImageType.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/ImageType.java deleted file mode 100644 index 1d81231..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/ImageType.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.util.Locale; - -public enum ImageType { - LOGO, - FORMULA, - SIGNATURE, - OTHER, - OCR, - GRAPHIC; - - - public static ImageType fromString(String imageType) { - - return switch (imageType.toLowerCase(Locale.ROOT)) { - case "logo" -> ImageType.LOGO; - case "formula" -> ImageType.FORMULA; - case "signature" -> ImageType.SIGNATURE; - case "ocr" -> ImageType.OCR; - case "graphic" -> ImageType.GRAPHIC; - default -> ImageType.OTHER; - }; - } -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/NodeType.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/NodeType.java deleted file mode 100644 index b8654dc..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/NodeType.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.util.Locale; - -public enum NodeType { - DOCUMENT, - SECTION, - 
SUPER_SECTION, - HEADLINE, - PARAGRAPH, - TABLE, - TABLE_CELL, - IMAGE, - HEADER, - FOOTER; - - - public String toString() { - - return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ENGLISH); - } -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java deleted file mode 100644 index 5911148..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Page.java +++ /dev/null @@ -1,94 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.llm.service.document.entity.TextEntity; -import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import lombok.experimental.FieldDefaults; - -/** - * Represents a single page in a document. - */ -@Getter -@Setter -@Builder -@NoArgsConstructor -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true) -public class Page { - - @EqualsAndHashCode.Include - Integer number; - Integer height; - Integer width; - Integer rotation; - - - List textBlocksOnPage; - Header header; - Footer footer; - - @Builder.Default - Set entities = new HashSet<>(); - - @Builder.Default - Set images = new HashSet<>(); - - - /** - * Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body. 
- * - * @return The main body text block. - */ - public TextBlock getMainBodyTextBlock() { - - return textBlocksOnPage.stream() - .filter(atb -> !atb.isEmpty()) - .collect(new TextBlockCollector()); - } - - - /** - * Retrieves the highest SemanticNodes, which appear only on this page. It is achieved by traversing the DocumentTree up, until a SemanticNode's direct parent is no longer exclusively on this page. - * - * @return A list which contains the highes SemanticNodes, which appear only on this page. - */ - public List getMainBody() { - - return textBlocksOnPage.stream() - .map(AtomicTextBlock::getParent) - .map(this::getHighestParentOnlyOnPage) - .distinct() - .toList(); - } - - private SemanticNode getHighestParentOnlyOnPage(SemanticNode node) { - - SemanticNode currentNode = node; - while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) { - currentNode = currentNode.getParent(); - } - return currentNode; - } - - - @Override - public String toString() { - - return String.valueOf(number); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Paragraph.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Paragraph.java deleted file mode 100644 index 0da9c51..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Paragraph.java +++ /dev/null @@ -1,54 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - - - -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -/** - * Represents a paragraph in the document. 
- */ -@Data -@SuperBuilder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PROTECTED) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Paragraph extends AbstractSemanticNode { - - TextBlock leafTextBlock; - - - @Override - public NodeType getType() { - - return NodeType.PARAGRAPH; - } - - - @Override - public boolean isLeaf() { - - return true; - } - - - @Override - public TextBlock getTextBlock() { - - return leafTextBlock; - } - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Section.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Section.java deleted file mode 100644 index 5e15a46..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Section.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; -import lombok.extern.slf4j.Slf4j; - -/** - * Represents a section within a document, encapsulating both its textual content and semantic structure. - */ -@Slf4j -@Data -@SuperBuilder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class Section extends AbstractSemanticNode { - - - @Override - public NodeType getType() { - - return NodeType.SECTION; - } - - - /** - * Checks if this section contains any tables. - * - * @return True if the section contains at least one table, false otherwise. 
- */ - public boolean hasTables() { - - return streamAllSubNodesOfType(NodeType.TABLE).findAny() - .isPresent(); - } - - - @Override - public SectionIdentifier getSectionIdentifier() { - - return getHeadline().getSectionIdentifier(); - } - - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary(); - } - - - public Headline getHeadline() { - - return streamChildrenOfType(NodeType.HEADLINE)// - .map(node -> (Headline) node)// - .findFirst()// - .orElseGet(() -> getParent().getHeadline()); - } - - - /** - * Checks if any headline within this section or its sub-nodes contains a given string. - * - * @param value The string to search for within headlines, case-sensitive. - * @return True if at least one headline contains the specified string, false otherwise. - */ - public boolean anyHeadlineContainsString(String value) { - - return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsString(value)); - } - - - /** - * Checks if any headline within this section or its sub-nodes contains a given string, case-insensitive. - * - * @param value The string to search for within headlines, case-insensitive. - * @return True if at least one headline contains the specified string, false otherwise. 
- */ - public boolean anyHeadlineContainsStringIgnoreCase(String value) { - - return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value)); - } - - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SectionIdentifier.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SectionIdentifier.java deleted file mode 100644 index d0c8026..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SectionIdentifier.java +++ /dev/null @@ -1,158 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.experimental.FieldDefaults; - -/** - * Represents a unique identifier for a section within a document. - */ -@AllArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class SectionIdentifier { - - static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); - - private enum Format { - EMPTY, - NUMERICAL, - DOCUMENT - } - - Format format; - String identifierString; - List identifiers; - boolean asChild; - - - /** - * Generates a SectionIdentifier from the headline text of a section, determining its format and structure. - * - * @param headline The headline text from which to generate the section identifier. - * @return A {@link SectionIdentifier} instance corresponding to the headline text. 
- */ - public static SectionIdentifier fromSearchText(String headline) { - - if (headline == null || headline.isEmpty() || headline.isBlank()) { - return SectionIdentifier.empty(); - } - - Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline); - if (numericalIdentifierMatcher.find()) { - return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher); - } - // more formats here - return SectionIdentifier.empty(); - } - - - /** - * Marks the current section identifier as a child of another section. - * - * @param sectionIdentifier The parent section identifier. - * @return A new {@link SectionIdentifier} instance marked as a child. - */ - public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) { - - return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true); - } - - - /** - * Generates a SectionIdentifier that represents the entire document. - * - * @return A {@link SectionIdentifier} with a document-wide scope. - */ - public static SectionIdentifier document() { - - return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false); - } - - - /** - * Generates an empty SectionIdentifier. - * - * @return An empty {@link SectionIdentifier} instance. 
- */ - public static SectionIdentifier empty() { - - return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false); - } - - - private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) { - - String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end()); - List identifiers = new LinkedList<>(); - for (int i = 1; i <= 4; i++) { - String numericalIdentifier = numericalIdentifierMatcher.group(i); - if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) { - break; - } - identifiers.add(Integer.parseInt(numericalIdentifier.trim())); - } - return new SectionIdentifier(Format.NUMERICAL, - identifierString, - identifiers.stream() - .toList(), - false); - } - - - /** - * Determines if the current section is the parent of the given section. - * - * @param sectionIdentifier The section identifier to compare against. - * @return true if the current section is the parent of the given section, false otherwise. - */ - public boolean isParentOf(SectionIdentifier sectionIdentifier) { - - if (this.format.equals(Format.EMPTY)) { - return false; - } - if (this.format.equals(Format.DOCUMENT)) { - return true; - } - if (!this.format.equals(sectionIdentifier.format)) { - return false; - } - if (this.identifiers.size() >= sectionIdentifier.identifiers.size() && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) { - return false; - } - for (int i = 0; i < this.identifiers.size(); i++) { - if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) { - return false; - } - } - return true; - } - - - /** - * Determines if the current section is a child of the given section, based on their identifiers. - * - * @param sectionIdentifier The section identifier to compare against. 
- * @return True if the current section is a child of the given section, false otherwise. - */ - public boolean isChildOf(SectionIdentifier sectionIdentifier) { - - if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) { - return false; - } - return sectionIdentifier.isParentOf(this); - } - - - @Override - public String toString() { - - return identifierString; - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java deleted file mode 100644 index c525738..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SemanticNode.java +++ /dev/null @@ -1,684 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import static java.lang.String.format; - -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.DocumentTree; -import com.knecon.fforesight.llm.service.document.RectangleTransformations; -import com.knecon.fforesight.llm.service.document.TextRange; -import com.knecon.fforesight.llm.service.document.entity.TextEntity; -import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto; - -public interface SemanticNode { - - /** - * Returns the type of this node, such as Section, Paragraph, etc. 
- * - * @return NodeType of this node - */ - NodeType getType(); - - - /** - * Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock. - * So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock - * If the Node is a Leaf, the LeafTextBlock will be returned instead. - * - * @return TextBlock containing all AtomicTextBlocks that are located under this Node. - */ - default TextBlock getTextBlock() { - - return streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getTextBlock) - .collect(new TextBlockCollector()); - } - - - /** - * Any Node maintains its own Set of Entities. - * This Set contains all Entities whose TextRange intersects the TextRange of this node. - * - * @return Set of all Entities associated with this Node - */ - Set getEntities(); - - - /** - * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. - * - * @return Set of PageNodes this node appears on. - */ - default Set getPages() { - - return getTextBlock().getPages(); - } - - - /** - * Finds the first page associated with this Node. - * - * @return Set of PageNodes this node appears on. - */ - default Page getFirstPage() { - - return getTextBlock().getPages() - .stream() - .min(Comparator.comparingInt(Page::getNumber)) - .orElseThrow(); - } - - - /** - * Each AtomicTextBlock is assigned a page, so to get the pages for this TextRange, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. - * - * @return Set of PageNodes this node appears on. 
- */ - default Set getPages(TextRange textRange) { - - if (!getTextRange().intersects(textRange)) { - throw new IllegalArgumentException(format("%s which was used to query for pages is not intersected in the %s of this node!", textRange, getTextRange())); - } - return getTextBlock().getPages(textRange); - } - - - /** - * Checks if the given page number exists in the list of pages. - * - * @param pageNumber the page number to be checked - * @return true if the page number exists, otherwise false - */ - default boolean onPage(int pageNumber) { - - return getPages().stream() - .anyMatch(page -> page.getNumber() == pageNumber); - } - - - /** - * Returns the DocumentTree Object. - * - * @return the DocumentTree of the Document this node belongs to - */ - DocumentTree getDocumentTree(); - - - /** - * The id is a List of Integers uniquely identifying this node in the DocumentTree. - * - * @return the DocumentTree ID - */ - List getTreeId(); - - - /** - * This should only be used during graph construction. - * - * @param tocId List of Integers - */ - void setTreeId(List tocId); - - - /** - * Traverses the Tree up, until it hits a Headline or hits a Section which will then return the first Headline from its children. - * If no Headline is found this way, it will recursively traverse the tree up and try again until it hits the root, where it will perform a BFS. - * If no Headline exists anywhere in the Document a dummy Headline is returned. - * - * @return First Headline found. - */ - default Headline getHeadline() { - - return getParent().getHeadline(); - } - - - /** - * Returns a SectionIdentifier, such that it acts as a child of the first Headline associated with this SemanticNode. - * - * @return The SectionIdentifier from the first Headline. - */ - default SectionIdentifier getSectionIdentifier() { - - return SectionIdentifier.asChildOf(getHeadline().getSectionIdentifier()); - } - - - /** - * Checks if its TreeId has a length greater than zero. 
- * - * @return boolean indicating whether this Node has a Parent in the DocumentTree - */ - default boolean hasParent() { - - return getDocumentTree().hasParentById(getTreeId()); - } - - - /** - * @return The SemanticNode representing the Parent in the DocumentTree - * throws NotFoundException, when no parent is present - */ - default SemanticNode getParent() { - - return getDocumentTree().getParentEntryById(getTreeId()).getNode(); - } - - - /** - * @return The SemanticNode which is directly underneath the document and also under which this node is. - * if this is the highest child node or the document itself, it returns itself. - */ - default SemanticNode getHighestParent() { - - return getDocumentTree().getHighestParentById(getTreeId()); - } - - - /** - * Returns the next sibling node of this SemanticNode in the document tree, if any. - * If there is no next sibling node, an empty Optional is returned. - * - * @return Optional containing the next sibling node, or empty if there is none - */ - default Optional getNextSibling() { - - return getDocumentTree().getNextSibling(getTreeId()); - } - - - /** - * Returns the previous sibling node of this SemanticNode in the document tree, if any. - * If there is no previous sibling node, an empty Optional is returned. - * - * @return Optional containing the previous sibling node, or empty if there is none - */ - default Optional getPreviousSibling() { - - return getDocumentTree().getPreviousSibling(getTreeId()); - } - - - /** - * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden. - * Currently only Sections, Images, and Tables are not leaves. - * A TableCell might be a leaf depending on its area compared to the page. 
- * - * @return boolean, indicating if a Node has direct access to a TextBlock - */ - default boolean isLeaf() { - - return false; - } - - - /** - * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden. - * Currently only Sections and Tables are no leaves. - * - * @return AtomicTextBlock - */ - default TextBlock getLeafTextBlock() { - - throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!"); - } - - - /** - * Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode. - * - * @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode - */ - default void setLeafTextBlock(TextBlock textBlock) { - - throw new UnsupportedOperationException(); - } - - - /** - * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node. - * If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1. - * - * @return Integer representing the number on the page - */ - default Integer getNumberOnPage() { - - TextBlock textBlock = getTextBlock(); - if (!textBlock.getAtomicTextBlocks().isEmpty()) { - return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage(); - } else { - return -1; - } - } - - - /** - * Checks if the SemanticNode contains any text. - * - * @return true, if this node's TextBlock is not empty - */ - default boolean hasText() { - - return !getTextBlock().isEmpty(); - } - - - /** - * Checks whether this SemanticNode contains the provided String. 
- * - * @param string A String which the TextBlock might contain - * @return true, if this node's TextBlock contains the string - */ - default boolean containsString(String string) { - - return getTextBlock().getSearchText().contains(string); - } - - Set getEngines(); - - - default void addEngine(LayoutEngineProto.LayoutEngine engine) { - - getEngines().add(engine); - } - - - /** - * Checks whether this SemanticNode contains all the provided Strings. - * - * @param strings A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains all strings - */ - default boolean containsAllStrings(String... strings) { - - return Arrays.stream(strings) - .allMatch(this::containsString); - } - - - /** - * Checks whether this SemanticNode contains any of the provided Strings. - * - * @param strings A List of Strings to check if they are contained in the TextBlock - * @return true, if this node's TextBlock contains any of the provided strings - */ - default boolean containsAnyString(String... strings) { - - return Arrays.stream(strings) - .anyMatch(this::containsString); - } - - - /** - * Checks whether this SemanticNode contains any of the provided Strings. - * - * @param strings A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains any of the strings - */ - default boolean containsAnyString(List strings) { - - return strings.stream() - .anyMatch(this::containsString); - } - - - /** - * Checks whether this SemanticNode contains all the provided Strings case-insensitive. - * - * @param string A String which the TextBlock might contain - * @return true, if this node's TextBlock contains the string case-insensitive - */ - default boolean containsStringIgnoreCase(String string) { - - return getTextBlock().getSearchText().toLowerCase(Locale.ROOT).contains(string.toLowerCase(Locale.ROOT)); - } - - - /** - * Checks whether this SemanticNode contains any of the provided Strings case-insensitive. 
- * - * @param strings A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains any of the strings - */ - default boolean containsAnyStringIgnoreCase(String... strings) { - - return Arrays.stream(strings) - .anyMatch(this::containsStringIgnoreCase); - } - - - /** - * Checks whether this SemanticNode contains any of the provided Strings case-insensitive. - * - * @param strings A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains any of the strings - */ - default boolean containsAllStringsIgnoreCase(String... strings) { - - return Arrays.stream(strings) - .allMatch(this::containsStringIgnoreCase); - } - - - /** - * Checks whether this SemanticNode contains exactly the provided String as a word. - * - * @param word - String which the TextBlock might contain - * @return true, if this node's TextBlock contains string - */ - default boolean containsWord(String word) { - - return getTextBlock().getWords() - .stream() - .anyMatch(s -> s.equals(word)); - } - - - /** - * Checks whether this SemanticNode contains exactly the provided String as a word case-insensitive. - * - * @param word - String which the TextBlock might contain - * @return true, if this node's TextBlock contains string - */ - default boolean containsWordIgnoreCase(String word) { - - return getTextBlock().getWords() - .stream() - .map(String::toLowerCase) - .anyMatch(s -> s.equals(word.toLowerCase(Locale.ENGLISH))); - } - - - /** - * Checks whether this SemanticNode contains any of the provided Strings as a word. - * - * @param words - A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains any of the provided strings - */ - default boolean containsAnyWord(String... 
words) { - - return Arrays.stream(words) - .anyMatch(word -> getTextBlock().getWords() - .stream() - .anyMatch(word::equals)); - } - - - /** - * Checks whether this SemanticNode contains any of the provided Strings as a word case-insensitive. - * - * @param words - A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains any of the provided strings - */ - default boolean containsAnyWordIgnoreCase(String... words) { - - return Arrays.stream(words) - .map(String::toLowerCase) - .anyMatch(word -> getTextBlock().getWords() - .stream() - .map(String::toLowerCase) - .anyMatch(word::equals)); - } - - - /** - * Checks whether this SemanticNode contains all the provided Strings as word. - * - * @param words - A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains all the provided strings - */ - default boolean containsAllWords(String... words) { - - return Arrays.stream(words) - .allMatch(word -> getTextBlock().getWords() - .stream() - .anyMatch(word::equals)); - } - - - /** - * Checks whether this SemanticNode contains all the provided Strings as word case-insensitive. - * - * @param words - A List of Strings which the TextBlock might contain - * @return true, if this node's TextBlock contains all the provided strings - */ - default boolean containsAllWordsIgnoreCase(String... words) { - - return Arrays.stream(words) - .map(String::toLowerCase) - .allMatch(word -> getTextBlock().getWords() - .stream() - .map(String::toLowerCase) - .anyMatch(word::equals)); - } - - - /** - * Checks whether this SemanticNode intersects the provided rectangle. 
- * - * @param x the lower left corner X value - * @param y the lower left corner Y value - * @param w width - * @param h height - * @param pageNumber the pageNumber of the rectangle - * @return true if intersects, false otherwise - */ - default boolean intersectsRectangle(int x, int y, int w, int h, int pageNumber) { - - return getBBox().entrySet() - .stream() - .filter(entry -> entry.getKey().getNumber() == pageNumber) - .map(Map.Entry::getValue) - .anyMatch(rect -> rect.intersects(x, y, w, h)); - } - - - /** - * This function is used during insertion of EntityNodes into the graph, it checks if the TextRange of the RedactionEntity intersects or even contains the RedactionEntity. - * It sets the fields accordingly and recursively calls this function on all its children. - * - * @param textEntity RedactionEntity, which is being inserted into the graph - */ - default void addThisToEntityIfIntersects(TextEntity textEntity) { - - TextBlock textBlock = getTextBlock(); - if (textBlock.getTextRange().intersects(textEntity.getTextRange())) { - if (textBlock.containsTextRange(textEntity.getTextRange())) { - textEntity.setDeepestFullyContainingNode(this); - } - textEntity.addIntersectingNode(this); - getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange()) - .forEach(node -> node.addThisToEntityIfIntersects(textEntity)); - } - } - - - /** - * Streams all children located directly underneath this node in the DocumentTree. - * - * @return Stream of all children - */ - default Stream streamChildren() { - - return getDocumentTree().childNodes(getTreeId()); - } - - - /** - * Streams all children located directly underneath this node in the DocumentTree of the provided type. 
- * - * @return Stream of all children - */ - default Stream streamChildrenOfType(NodeType nodeType) { - - return getDocumentTree().childNodesOfType(getTreeId(), nodeType); - } - - - /** - * Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order. - * - * @return Stream of all SubNodes - */ - default Stream streamAllSubNodes() { - - return getDocumentTree().allSubEntriesInOrder(getTreeId()) - .map(DocumentTree.Entry::getNode); - } - - - /** - * Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order. - * - * @return Stream of all SubNodes - */ - default Stream streamAllSubNodesOfType(NodeType nodeType) { - - return getDocumentTree().allSubEntriesInOrder(getTreeId()) - .filter(entry -> entry.getType().equals(nodeType)) - .map(DocumentTree.Entry::getNode); - } - - - /** - * The TextRange is the start and end string offsets in the reading order of the document. - * - * @return TextRange of this Node's TextBlock - */ - default TextRange getTextRange() { - - return getTextBlock().getTextRange(); - } - - - /** - * Returns the length of the text content in this Node's TextBlock. - * - * @return The length of the text content - */ - default int length() { - - return getTextRange().length(); - } - - - /** - * For a given TextRange this function returns a List of rectangle around the text in the range. - * These Rectangles are split either by a new line or by a large gap in the current line. - * This is mainly used to find the positions of TextEntities - * - * @param textRange A TextRange to calculate the positions for. 
- * @return A Map, where the keys are the pages and the values are a list of rectangles describing the position of words - */ - default Map> getPositionsPerPage(TextRange textRange) { - - if (isLeaf()) { - return getTextBlock().getPositionsPerPage(textRange); - } - Optional containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange); - if (containingChildNode.isEmpty()) { - return getTextBlock().getPositionsPerPage(textRange); - } - return containingChildNode.get().getPositionsPerPage(textRange); - } - - - /** - * If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children. - * If called on the Document, it will return the cropbox of each page - * - * @return Rectangle2D fully encapsulating this Node for each page. - */ - default Map getBBox() { - - if (isLeaf()) { - return getBBoxFromLeafTextBlock(); - } - - return getBBoxFromChildren(); - } - - - /** - * Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page. - * - * @param rectangle2D The rectangle to check if it is contained - * @param pageNumber The Page number on which the rectangle should be checked - * @return boolean - */ - default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) { - - Page helperPage = Page.builder().number(pageNumber).build(); - if (!getPages().contains(helperPage)) { - return false; - } - return getBBox().get(helperPage).contains(rectangle2D); - } - - - /** - * TODO: this produces unwanted results for sections spanning multiple columns. - * Computes the Union of the bounding boxes of all children recursively. 
- * - * @return The union of the BoundingBoxes of all children - */ - private Map getBBoxFromChildren() { - - Map bBoxPerPage = new HashMap<>(); - List> childrenBBoxes = streamChildren().map(SemanticNode::getBBox) - .toList(); - Set pages = childrenBBoxes.stream() - .flatMap(map -> map.keySet() - .stream()) - .collect(Collectors.toSet()); - for (Page page : pages) { - Rectangle2D bBoxOnPage = childrenBBoxes.stream() - .filter(childBboxPerPage -> childBboxPerPage.containsKey(page)) - .map(childBboxPerPage -> childBboxPerPage.get(page)) - .collect(RectangleTransformations.collectBBox()); - bBoxPerPage.put(page, bBoxOnPage); - } - return bBoxPerPage; - } - - - /** - * @return The union of all BoundingBoxes of the TextBlock of this node - */ - private Map getBBoxFromLeafTextBlock() { - - Map bBoxPerPage = new HashMap<>(); - Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks() - .stream() - .collect(Collectors.groupingBy(AtomicTextBlock::getPage)); - atomicTextBlockPerPage.forEach((page, atomicTextBlocks) -> bBoxPerPage.put(page, RectangleTransformations.atomicTextBlockBBox(atomicTextBlocks))); - return bBoxPerPage; - } - - - /** - * Checks wether this SemanticNode appears on a single page only, and if that page is the provided one. - * - * @param page the page to check - * @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false. 
- */ - default boolean onlyOnPage(Page page) { - - Set pages = getPages(); - return pages.size() == 1 && pages.contains(page); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SuperSection.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SuperSection.java deleted file mode 100644 index b2d0248..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/SuperSection.java +++ /dev/null @@ -1,89 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; -import lombok.extern.slf4j.Slf4j; - -/** - * Represents a section within a document, encapsulating both its textual content and semantic structure. - */ -@Slf4j -@Data -@SuperBuilder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class SuperSection extends AbstractSemanticNode { - - @Override - public NodeType getType() { - - return NodeType.SUPER_SECTION; - } - - - /** - * Checks if this section contains any tables. - * - * @return True if the section contains at least one table, false otherwise. 
- */ - public boolean hasTables() { - - return streamAllSubNodesOfType(NodeType.TABLE).findAny() - .isPresent(); - } - - - @Override - public SectionIdentifier getSectionIdentifier() { - - return getHeadline().getSectionIdentifier(); - } - - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary(); - } - - - public Headline getHeadline() { - - return streamChildrenOfType(NodeType.HEADLINE)// - .map(node -> (Headline) node)// - .findFirst()// - .orElseGet(() -> getParent().getHeadline()); - } - - - /** - * Checks if any headline within this section or its sub-nodes contains a given string. - * - * @param value The string to search for within headlines, case-sensitive. - * @return True if at least one headline contains the specified string, false otherwise. - */ - public boolean anyHeadlineContainsString(String value) { - - return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsString(value)); - } - - - /** - * Checks if any headline within this section or its sub-nodes contains a given string, case-insensitive. - * - * @param value The string to search for within headlines, case-insensitive. - * @return True if at least one headline contains the specified string, false otherwise. 
- */ - public boolean anyHeadlineContainsStringIgnoreCase(String value) { - - return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value)); - } - - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java deleted file mode 100644 index 80831f4..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/Table.java +++ /dev/null @@ -1,306 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; - -import static java.lang.String.format; - -import java.awt.geom.Rectangle2D; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.stream.IntStream; -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.DocumentTree; -import com.knecon.fforesight.llm.service.document.entity.TextEntity; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngineProto; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - -/** - * Represents a table within a document. 
- */ -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true) -public class Table implements SemanticNode { - - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngineProto.LayoutEngine.ALGORITHM)); - @EqualsAndHashCode.Include - List treeId; - DocumentTree documentTree; - - int numberOfRows; - int numberOfCols; - - TextBlock textBlock; - - @Builder.Default - Set entities = new HashSet<>(); - - Map bBoxCache; - - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = SemanticNode.super.getBBox(); - } - return bBoxCache; - } - - - /** - * Streams all entities in this table, that appear in a row, which contains any of the provided strings. - * - * @param strings Strings to check whether a row contains them - * @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings - */ - public Stream streamEntitiesWhereRowContainsStringsIgnoreCase(List strings) { - - return IntStream.range(0, numberOfRows).boxed() - .filter(row -> rowContainsStringsIgnoreCase(row, strings)) - .flatMap(this::streamRow) - .map(TableCell::getEntities) - .flatMap(Collection::stream); - } - - - /** - * Checks whether the specified row contains all the provided strings. - * - * @param row the row to check as an Integer, must be smaller than numberOfRows - * @param strings a list of strings to check for - * @return true, if all strings appear in the provided row - */ - public boolean rowContainsStringsIgnoreCase(Integer row, List strings) { - - String rowText = streamRow(row).map(TableCell::getTextBlock) - .collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT); - return strings.stream() - .map(String::toLowerCase) - .allMatch(rowText::contains); - } - - - /** - * Streams all entities which appear in a row where at least one cell has the provided header and the provided value. 
- * - * @param header the header value to search for - * @param value the string which the table cell should contain - * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value. - */ - public Stream streamEntitiesWhereRowHasHeaderAndValue(String header, String value) { - - List vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header)) - .map(TableCell::getCol) - .toList(); - return streamTableCells().filter(tableCellNode -> vertebrateStudyCols.stream() - .anyMatch(vertebrateStudyCol -> getCell(tableCellNode.getRow(), vertebrateStudyCol).containsString(value))) - .map(TableCell::getEntities) - .flatMap(Collection::stream); - } - - - /** - * Streams all entities which appear in a row where at least one cell has the provided header and any provided value. - * - * @param header the header value to search for - * @param values the strings which the table cell should contain - * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value. - */ - public Stream streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List values) { - - List colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header)) - .map(TableCell::getCol) - .toList(); - return streamTableCells().filter(tableCellNode -> colsWithHeader.stream() - .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values))) - .map(TableCell::getEntities) - .flatMap(Collection::stream); - } - - - /** - * Returns a TableCell at the provided row and column location. 
- * - * @param row int representing the row, must be smaller than numberOfRows - * @param col int representing the col, must be smaller than numberOfCols - * @return TableCell at the provided location in the table - */ - public TableCell getCell(int row, int col) { - - if (numberOfRows - row < 0 || numberOfCols - col < 0) { - throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols)); - } - int idx = row * numberOfCols + col; - return (TableCell) documentTree.getEntryById(treeId).getChildren().get(idx).getNode(); - } - - - /** - * Streams all TableCells in this Table row-wise. - * - * @return Stream of all TableCells - */ - public Stream streamTableCells() { - - return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node); - } - - - /** - * Streams all TableCells in this Table which have the provided header row-wise. - * - * @return Stream of all TableCells which have the provided header - */ - public Stream streamTableCellsWithHeader(String header) { - - return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header)) - .map(TableCell::getCol) - .flatMap(this::streamCol) - .filter(tableCellNode -> !tableCellNode.isHeader()); - } - - - /** - * Streams all TableCells belonging to the provided column from top down. - * - * @param col int representing the column - * @return Stream of all TableCell in the provided column - */ - public Stream streamCol(int col) { - - return IntStream.range(0, numberOfRows).boxed() - .map(row -> getCell(row, col)); - } - - - /** - * Streams all TableCells belonging to the provided row from left to right. 
- * - * @param row int representing the row - * @return Stream of all TableCell in the provided row - */ - public Stream streamRow(int row) { - - return IntStream.range(0, numberOfCols).boxed() - .map(col -> getCell(row, col)); - } - - - /** - * Streams all TableCells row-wise and filters them with header == true. - * - * @return Stream of all TableCells with header == true - */ - public Stream streamHeaders() { - - return streamTableCells().filter(TableCell::isHeader); - } - - - /** - * Streams all TableCells of the provided row and column and filters them with header == true. - * - * @param row int representing the row - * @param col int representing the column - * @return Stream of all TableCells with header == true in the provided row or col - */ - public Stream streamHeadersForCell(int row, int col) { - - return Stream.concat(streamRow(row), streamCol(col)) - .filter(TableCell::isHeader); - } - - - /** - * Streams all Headers and checks if any equal the provided string. - * - * @param header string to check the headers for - * @return true, if at least one header equals the provided string - */ - public boolean hasHeader(String header) { - - return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header)); - } - - - /** - * Streams all Headers and checks if any equal the provided string. - * - * @param header string to check the headers for - * @return true, if at least one header equals the provided string - */ - public boolean hasHeaderIgnoreCase(String header) { - - return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock() - .getSearchText() - .strip() - .toLowerCase(Locale.ENGLISH) - .equals(header.toLowerCase(Locale.ENGLISH))); - } - - - /** - * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value. 
- * - * @param header string to find header cells - * @param value string to check cells with provided header - * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value - */ - public boolean hasRowWithHeaderAndValue(String header, String value) { - - return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value)); - } - - - /** - * Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values. - * - * @param header string to find header cells - * @param values List of strings to check cells with provided header - * @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values. - */ - public boolean hasRowWithHeaderAndAnyValue(String header, List values) { - - return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values)); - } - - - @Override - public NodeType getType() { - - return NodeType.TABLE; - } - - - @Override - public TextBlock getTextBlock() { - - if (textBlock == null) { - textBlock = SemanticNode.super.getTextBlock(); - } - return textBlock; - } - - - @Override - public String toString() { - - return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/TableCell.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/TableCell.java deleted file mode 100644 index e76ad2f..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/nodes/TableCell.java +++ /dev/null @@ -1,84 +0,0 @@ -package com.knecon.fforesight.llm.service.document.nodes; 
- -import java.awt.geom.Rectangle2D; -import java.util.HashMap; -import java.util.Map; - -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; -import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -/** - * Represents a single table cell within a table. - */ -@Data -@SuperBuilder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true) -public class TableCell extends AbstractSemanticNode { - - int row; - int col; - boolean header; - - Rectangle2D bBox; - - TextBlock leafTextBlock; - - TextBlock textBlock; - - - - @Override - public Map getBBox() { - - Map bBoxPerPage = new HashMap<>(); - getPages().forEach(page -> bBoxPerPage.put(page, bBox)); - return bBoxPerPage; - } - - - @Override - public NodeType getType() { - - return NodeType.TABLE_CELL; - } - - - @Override - public boolean isLeaf() { - - return getDocumentTree().getEntryById(getTreeId()).getChildren().isEmpty(); - } - - - @Override - public TextBlock getTextBlock() { - - if (isLeaf()) { - return leafTextBlock; - } - - if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); - } - return textBlock; - } - - - @Override - public String toString() { - - return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java deleted file mode 100644 index 8151869..0000000 --- 
a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/AtomicTextBlock.java +++ /dev/null @@ -1,257 +0,0 @@ -package com.knecon.fforesight.llm.service.document.textblock; - -import static java.lang.String.format; - -import java.awt.geom.Rectangle2D; -import java.text.BreakIterator; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import com.knecon.fforesight.llm.service.document.RectangleTransformations; -import com.knecon.fforesight.llm.service.document.TextRange; -import com.knecon.fforesight.llm.service.document.nodes.Page; -import com.knecon.fforesight.llm.service.document.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class AtomicTextBlock implements TextBlock { - - Long id; - Integer numberOnPage; - Page page; - - //string coordinates - TextRange textRange; - String searchText; - List words; - List lineBreaks; - - //position coordinates - List stringIdxToPositionIdx; - @Getter - List positions; - - @EqualsAndHashCode.Exclude - SemanticNode parent; - - - @Override - public int numberOfLines() { - - return lineBreaks.size() + 1; - } - - - public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) { - - return AtomicTextBlock.builder() - .id(textBlockIdx) - .textRange(new 
TextRange(stringOffset, stringOffset)) - .searchText("") - .lineBreaks(Collections.emptyList()) - .page(page) - .numberOnPage(numberOnPage) - .stringIdxToPositionIdx(Collections.emptyList()) - .positions(Collections.emptyList()) - .parent(parent) - .build(); - } - - - public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextDataProto.DocumentTextData atomicTextBlockData, - DocumentPositionDataProto.DocumentPositionData atomicPositionBlockData, - SemanticNode parent, - Page page) { - - return AtomicTextBlock.builder() - .id(atomicTextBlockData.getId()) - .numberOnPage(atomicTextBlockData.getNumberOnPage()) - .page(page) - .textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd())) - .searchText(atomicTextBlockData.getSearchText()) - .lineBreaks(atomicTextBlockData.getLineBreaksList()) - .stringIdxToPositionIdx(atomicPositionBlockData.getStringIdxToPositionIdxList()) - .positions(toRectangle2DList(atomicPositionBlockData.getPositionsList())) - .parent(parent) - .build(); - } - - - private static List toRectangle2DList(List positions) { - - return positions.stream() - .map(pos -> (Rectangle2D) new Rectangle2D.Float(pos.getValue(0), pos.getValue(1), pos.getValue(2), pos.getValue(3))) - .toList(); - } - - - public TextRange getLineTextRange(int lineNumber) { - - if (lineNumber >= numberOfLines() || lineNumber < 0) { - return new TextRange(textRange.start(), textRange.start()); - } - if (numberOfLines() == 1) { - return textRange; - } - if (lineNumber == 0) { - return new TextRange(textRange.start(), lineBreaks.get(0) + textRange.start()); - } else if (lineNumber == numberOfLines() - 1) { - return new TextRange(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end()); - } - return new TextRange(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start()); - } - - - public List getWords() { - - if (words == null) { - words = new ArrayList<>(); - BreakIterator iterator = 
BreakIterator.getWordInstance(Locale.ENGLISH); - iterator.setText(searchText); - int start = iterator.first(); - for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) { - words.add(searchText.substring(start, end)); - } - } - return words; - } - - - @Override - public List getAtomicTextBlocks() { - - return List.of(this); - } - - - @Override - public int getNextLinebreak(int fromIndex) { - - return lineBreaks.stream()// - .filter(linebreak -> linebreak > fromIndex - textRange.start()) // - .findFirst() // - .orElse(searchText.length()) + textRange.start(); - } - - - @Override - public int getPreviousLinebreak(int fromIndex) { - - return lineBreaks.stream()// - .filter(linebreak -> linebreak <= fromIndex - textRange.start())// - .reduce((a, b) -> b)// - .orElse(0) + textRange.start(); - } - - - @Override - public Rectangle2D getPosition(int stringIdx) { - - return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start())); - } - - - @Override - public List getPositions(TextRange stringTextRange) { - - if (!containsTextRange(stringTextRange)) { - throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange)); - } - if (stringTextRange.length() == 0) { - return Collections.emptyList(); - } - - int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start()); - - if (stringTextRange.end() == this.textRange.end()) { - return positions.subList(startPositionIdx, positions.size()); - } - - return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start())); - - } - - - public Map> getPositionsPerPage(TextRange stringTextRange) { - - List rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange)) - .stream() - .map(this::getPositions) - .map(RectangleTransformations::rectangleBBoxWithGaps) - .flatMap(Collection::stream) - .toList(); - Map> rectanglePerLinePerPage = 
new HashMap<>(); - rectanglePerLinePerPage.put(page, rectanglesPerLine); - return rectanglePerLinePerPage; - } - - - @Override - public String subSequenceWithLineBreaks(TextRange textRange) { - - if (textRange.length() == 0 || !getTextRange().contains(textRange)) { - return ""; - } - - Set lbInBoundary = lineBreaks.stream() - .map(i -> i + textRange.start()) - .filter(textRange::contains) - .collect(Collectors.toSet()); - if (textRange.end() == getTextRange().end()) { - lbInBoundary.add(getTextRange().end()); - } - StringBuilder sb = new StringBuilder(); - for (int i = textRange.start(); i < textRange.end(); i++) { - char character = this.charAt(i); - if (lbInBoundary.contains(i + 1)) { - // always plus one, due to the linebreaks being an exclusive end index - if (!Character.isWhitespace(character)) { - lbInBoundary.remove(i + 1); - lbInBoundary.add(i + 2); - sb.append(character); - continue; - } - sb.append("\n"); - } else { - sb.append(character); - } - } - return sb.toString(); - } - - - private List getAllLineBreaksInBoundary(TextRange textRange) { - - return getLineBreaks().stream() - .map(linebreak -> linebreak + this.textRange.start()) - .filter(textRange::contains) - .toList(); - } - - - @Override - public String toString() { - - return searchText; - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/ConcatenatedTextBlock.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/ConcatenatedTextBlock.java deleted file mode 100644 index 7762736..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/ConcatenatedTextBlock.java +++ /dev/null @@ -1,268 +0,0 @@ -package com.knecon.fforesight.llm.service.document.textblock; - -import static java.lang.String.format; - -import java.awt.geom.Rectangle2D; -import java.util.Collection; -import java.util.Collections; -import 
java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.stream.Stream; - -import com.knecon.fforesight.llm.service.document.TextRange; -import com.knecon.fforesight.llm.service.document.nodes.Page; - -import lombok.AccessLevel; -import lombok.Data; -import lombok.experimental.FieldDefaults; - -@Data -@FieldDefaults(level = AccessLevel.PRIVATE) -public class ConcatenatedTextBlock implements TextBlock { - - List atomicTextBlocks; - String searchText; - TextRange textRange; - - - public static ConcatenatedTextBlock empty() { - - return new ConcatenatedTextBlock(Collections.emptyList()); - } - - - public ConcatenatedTextBlock(List atomicTextBlocks) { - - this.atomicTextBlocks = new LinkedList<>(); - if (atomicTextBlocks.isEmpty()) { - textRange = new TextRange(-1, -1); - return; - } - var firstTextBlock = atomicTextBlocks.get(0); - this.atomicTextBlocks.add(firstTextBlock); - textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end()); - - atomicTextBlocks.subList(1, atomicTextBlocks.size()) - .forEach(this::concat); - } - - - public ConcatenatedTextBlock concat(TextBlock textBlock) { - - if (this.atomicTextBlocks.isEmpty()) { - textRange.setStart(textBlock.getTextRange().start()); - textRange.setEnd(textBlock.getTextRange().end()); - } else if (textRange.end() != textBlock.getTextRange().start()) { - throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange())); - } - this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks()); - textRange.setEnd(textBlock.getTextRange().end()); - this.searchText = null; - return this; - } - - - private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) { - - return atomicTextBlocks.stream() - .filter(textBlock -> textBlock.getTextRange().contains(stringIdx)) - .findAny() - .orElseThrow(IndexOutOfBoundsException::new); - } - - - 
private List getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) { - - List intersectingAtomicTextBlocks = new LinkedList<>(); - for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) { - if (atomicTextBlock.getTextRange().start() > textRange.end()) { - break; // early stop, following TextBlocks will never intersect - } - if (atomicTextBlock.getTextRange().intersects(textRange)) { - intersectingAtomicTextBlocks.add(atomicTextBlock); - } - } - return intersectingAtomicTextBlocks; - } - - - @Override - public String getSearchText() { - - if (searchText == null) { - StringBuilder sb = new StringBuilder(); - getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText())); - searchText = sb.toString(); - } - return searchText; - } - - - @Override - public List getWords() { - - return atomicTextBlocks.stream() - .map(AtomicTextBlock::getWords) - .flatMap(Collection::stream) - .toList(); - } - - - @Override - public int numberOfLines() { - - return atomicTextBlocks.stream() - .mapToInt(AtomicTextBlock::numberOfLines).sum(); - } - - - @Override - public int getNextLinebreak(int fromIndex) { - - return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex); - } - - - @Override - public int getPreviousLinebreak(int fromIndex) { - - return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex); - } - - - @Override - public List getLineBreaks() { - - return getAtomicTextBlocks().stream() - .flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks() - .stream()) - .toList(); - } - - - @Override - public Rectangle2D getPosition(int stringIdx) { - - return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx); - } - - - public TextRange getLineTextRange(int lineNumber) { - - if (atomicTextBlocks.size() == 1) { - return atomicTextBlocks.get(0).getLineTextRange(lineNumber); - } - int lineNumberInCurrentBlock = lineNumber; - for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) { - if 
(lineNumberInCurrentBlock < atomicTextBlock.numberOfLines()) { - return atomicTextBlock.getLineTextRange(lineNumberInCurrentBlock); - } - lineNumberInCurrentBlock -= atomicTextBlock.numberOfLines(); - } - return new TextRange(textRange.start(), textRange.start()); - } - - - @Override - public List getPositions(TextRange stringTextRange) { - - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); - - if (textBlocks.isEmpty()) { - return Collections.emptyList(); - } - if (textBlocks.size() == 1) { - return textBlocks.get(0).getPositions(stringTextRange); - } - - AtomicTextBlock firstTextBlock = textBlocks.get(0); - List positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()))); - - for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { - positions.addAll(textBlock.getPositions()); - } - - var lastTextBlock = textBlocks.get(textBlocks.size() - 1); - positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); - - return positions; - } - - - @Override - public Map> getPositionsPerPage(TextRange stringTextRange) { - - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); - - if (textBlocks.isEmpty()) { - return new HashMap<>(); - } - - if (textBlocks.size() == 1) { - return textBlocks.get(0).getPositionsPerPage(stringTextRange); - } - - AtomicTextBlock firstTextBlock = textBlocks.get(0); - Map> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())); - - for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { - rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange())); - } - - AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1); - 
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, - lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), - stringTextRange.end()))); - - return rectanglesPerLinePerPage; - } - - - @Override - public String subSequenceWithLineBreaks(TextRange textRange) { - - if (textRange.length() == 0 || !getTextRange().contains(textRange)) { - return ""; - } - - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(textRange); - - if (textBlocks.size() == 1) { - return textBlocks.get(0).subSequenceWithLineBreaks(textRange); - } - - StringBuilder sb = new StringBuilder(); - AtomicTextBlock firstTextBlock = textBlocks.get(0); - sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(textRange.start(), firstTextBlock.getTextRange().end()))); - - for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { - sb.append(textBlock.searchTextWithLineBreaks()); - } - - var lastTextBlock = textBlocks.get(textBlocks.size() - 1); - sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), textRange.end()))); - - return sb.toString(); - } - - - private Map> mergeEntityPositionsWithSamePageNode(Map> map1, Map> map2) { - - Map> mergedMap = new HashMap<>(map1); - map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, - rectangles, - (l1, l2) -> Stream.concat(l1.stream(), l2.stream()) - .toList())); - return mergedMap; - } - - - @Override - public String toString() { - - return getSearchText(); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlock.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlock.java deleted file mode 100644 index e6f88e2..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlock.java +++ /dev/null @@ 
-1,176 +0,0 @@ -package com.knecon.fforesight.llm.service.document.textblock; - -import static java.lang.String.format; - -import java.awt.geom.Rectangle2D; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import com.knecon.fforesight.llm.service.document.RectangleTransformations; -import com.knecon.fforesight.llm.service.document.TextRange; -import com.knecon.fforesight.llm.service.document.nodes.Page; - -public interface TextBlock extends CharSequence { - - String getSearchText(); - - - List getWords(); - - - List getAtomicTextBlocks(); - - - TextRange getTextRange(); - - - int getNextLinebreak(int fromIndex); - - - int getPreviousLinebreak(int fromIndex); - - - TextRange getLineTextRange(int lineNumber); - - - List getLineBreaks(); - - - Rectangle2D getPosition(int stringIdx); - - - List getPositions(TextRange stringTextRange); - - - Map> getPositionsPerPage(TextRange stringTextRange); - - - String subSequenceWithLineBreaks(TextRange textRange); - - - int numberOfLines(); - - - default CharSequence getLine(int lineNumber) { - - return subSequence(getLineTextRange(lineNumber)); - } - - - default List getLinePositions(int lineNumber) { - - return getPositions(getLineTextRange(lineNumber)); - } - - - default Rectangle2D getLineBBox(int lineNumber) { - - return RectangleTransformations.rectangle2DBBox(getLinePositions(lineNumber)); - } - - - default String searchTextWithLineBreaks() { - - return subSequenceWithLineBreaks(getTextRange()); - } - - - default int indexOf(String searchTerm) { - - return indexOf(searchTerm, getTextRange().start()); - } - - - default Set getPages() { - - return getAtomicTextBlocks().stream() - .map(AtomicTextBlock::getPage) - .collect(Collectors.toUnmodifiableSet()); - } - - - default Set getPages(TextRange textRange) { - - return getAtomicTextBlocks().stream() - .filter(atomicTextBlock -> 
atomicTextBlock.getTextRange().intersects(textRange)) - .map(AtomicTextBlock::getPage) - .collect(Collectors.toUnmodifiableSet()); - } - - - default int indexOf(String searchTerm, int startOffset) { - - int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start()); - if (start == -1) { - return -1; - } - return start + getTextRange().start(); - } - - - default CharSequence getFirstLine() { - - return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start())); - } - - - default boolean containsTextRange(TextRange textRange) { - - if (textRange.end() < textRange.start()) { - throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange)); - } - return getTextRange().contains(textRange); - } - - - default boolean containsIndex(int stringIndex) { - - return getTextRange().contains(stringIndex); - } - - - default CharSequence subSequence(TextRange textRange) { - - return subSequence(textRange.start(), textRange.end()); - } - - - default String buildSummary() { - - String searchText = getSearchText(); - // substring, as splitting very large strings gets expensive - searchText = searchText.substring(0, Math.min(searchText.length(), 200)); - - String[] words = searchText.split(" "); - int bound = Math.min(words.length, 4); - List list = new ArrayList<>(Arrays.asList(words).subList(0, bound)); - - return String.join(" ", list); - } - - - @Override - default CharSequence subSequence(int start, int end) { - - return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start()); - } - - - @Override - default int length() { - - return getTextRange().length(); - } - - - @Override - default char charAt(int index) { - - return getSearchText().charAt(index - getTextRange().start()); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlockCollector.java 
b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlockCollector.java deleted file mode 100644 index 3db64e6..0000000 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/document/textblock/TextBlockCollector.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.knecon.fforesight.llm.service.document.textblock; - -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; - -import lombok.NoArgsConstructor; - -@NoArgsConstructor -public class TextBlockCollector implements Collector { - - @Override - public Supplier supplier() { - - return ConcatenatedTextBlock::empty; - } - - - @Override - public BiConsumer accumulator() { - - return ConcatenatedTextBlock::concat; - } - - - @Override - public BinaryOperator combiner() { - - return ConcatenatedTextBlock::concat; - } - - - @Override - public Function finisher() { - - return a -> a; - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT); - } - -} diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/models/Chunk.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/models/Chunk.java index cb440f0..30fd781 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/models/Chunk.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/models/Chunk.java @@ -3,11 +3,12 @@ package com.knecon.fforesight.llm.service.models; import java.util.List; import java.util.Optional; +import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; +import 
com.iqser.red.service.redaction.v1.server.model.document.textblock.ConsecutiveTextBlockCollector; +import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.ChunkingResponseData; -import com.knecon.fforesight.llm.service.document.ConsecutiveTextBlockCollector; -import com.knecon.fforesight.llm.service.document.DocumentTree; -import com.knecon.fforesight.llm.service.document.nodes.Document; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; + import lombok.extern.slf4j.Slf4j; @@ -20,7 +21,7 @@ public record Chunk(String markdown, List parts) { } - private static List getChunkParts(com.knecon.fforesight.llm.service.document.nodes.Document document, List> treeIds) { + private static List getChunkParts(Document document, List> treeIds) { return treeIds.stream() .map(treeId -> { diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/DocumentBuilderService.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/DocumentBuilderService.java index 83a7ea7..d03830a 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/DocumentBuilderService.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/DocumentBuilderService.java @@ -1,41 +1,21 @@ package com.knecon.fforesight.llm.service.services; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; -import java.util.Arrays; -import java.util.stream.Collectors; - import org.springframework.stereotype.Service; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.primitives.Floats; +import com.iqser.red.service.redaction.v1.server.data.DocumentData; +import 
com.iqser.red.service.redaction.v1.server.data.DocumentPageProto; +import com.iqser.red.service.redaction.v1.server.data.DocumentPositionDataProto; +import com.iqser.red.service.redaction.v1.server.data.DocumentStructureProto; +import com.iqser.red.service.redaction.v1.server.data.DocumentStructureWrapper; +import com.iqser.red.service.redaction.v1.server.data.DocumentTextDataProto; +import com.iqser.red.service.redaction.v1.server.mapper.DocumentGraphMapper; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.llm.service.LlmNerMessage; -import com.knecon.fforesight.llm.service.document.DocumentData; -import com.knecon.fforesight.llm.service.document.DocumentGraphMapper; -import com.knecon.fforesight.llm.service.document.nodes.Document; -import com.knecon.fforesight.llm.service.utils.StorageIdUtils; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPageProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionDataProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructureWrapper; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextDataProto; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.EntryDataProto; -import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeTypeProto; import com.knecon.fforesight.tenantcommons.TenantContext; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; -import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; @@ -46,229 +26,24 @@ import lombok.extern.slf4j.Slf4j; public class DocumentBuilderService { StorageService storageService; - ObjectMapper mapper; + public Document build(LlmNerMessage llmNerMessage) { DocumentData documentData = new DocumentData(); - documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(fetchDocumentStructure(llmNerMessage.getDocumentStructureStorageId()))); - documentData.setDocumentTextData(fetchDocumentTextData(llmNerMessage.getDocumentTextStorageId())); - documentData.setDocumentPositionData(fetchDocumentPositionData(llmNerMessage.getDocumentPositionStorageId())); - documentData.setDocumentPages(fetchAllDocumentPages(llmNerMessage.getDocumentPagesStorageId())); + documentData.setDocumentStructureWrapper(new DocumentStructureWrapper(storageService.readProtoObject(TenantContext.getTenantId(), + llmNerMessage.getDocumentStructureStorageId(), + DocumentStructureProto.DocumentStructure.parser()))); + documentData.setDocumentTextData(storageService.readProtoObject(TenantContext.getTenantId(), + llmNerMessage.getDocumentTextStorageId(), + DocumentTextDataProto.AllDocumentTextData.parser())); + documentData.setDocumentPositionData(storageService.readProtoObject(TenantContext.getTenantId(), + llmNerMessage.getDocumentPositionStorageId(), + DocumentPositionDataProto.AllDocumentPositionData.parser())); + documentData.setDocumentPages(storageService.readProtoObject(TenantContext.getTenantId(), + llmNerMessage.getDocumentPagesStorageId(), + DocumentPageProto.AllDocumentPages.parser())); return DocumentGraphMapper.toDocumentGraph(documentData); } - - private DocumentStructureProto.DocumentStructure fetchDocumentStructure(String 
storageId) { - - DocumentStructureProto.DocumentStructure documentStructure; - StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); - - if (storageInfo.fileTypeExtension().contains("proto")) { - documentStructure = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentStructureProto.DocumentStructure.parser()); - } else { - DocumentStructure oldDocumentStructure = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentStructure.class); - if (oldDocumentStructure == null) { - return null; - } - documentStructure = convertDocumentStructure(oldDocumentStructure); - } - - return documentStructure; - } - - - private DocumentTextDataProto.AllDocumentTextData fetchDocumentTextData(String storageId) { - - DocumentTextDataProto.AllDocumentTextData documentTextData; - StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); - - if (storageInfo.fileTypeExtension().contains("proto")) { - documentTextData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentTextDataProto.AllDocumentTextData.parser()); - } else { - DocumentTextData[] oldDocumentTextData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentTextData[].class); - if (oldDocumentTextData == null) { - return null; - } - documentTextData = convertAllDocumentTextData(oldDocumentTextData); - } - - return documentTextData; - } - - - private DocumentPositionDataProto.AllDocumentPositionData fetchDocumentPositionData(String storageId) { - - DocumentPositionDataProto.AllDocumentPositionData documentPositionData; - StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); - - if (storageInfo.fileTypeExtension().contains("proto")) { - documentPositionData = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPositionDataProto.AllDocumentPositionData.parser()); - } else { - 
DocumentPositionData[] oldDocumentPositionData = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPositionData[].class); - if (oldDocumentPositionData == null) { - return null; - } - documentPositionData = convertAllDocumentPositionData(oldDocumentPositionData); - } - - return documentPositionData; - } - - - private DocumentPageProto.AllDocumentPages fetchAllDocumentPages(String storageId) { - - DocumentPageProto.AllDocumentPages allDocumentPages; - StorageIdUtils.StorageInfo storageInfo = StorageIdUtils.parseStorageId(storageId); - - if (storageInfo.fileTypeExtension().contains("proto")) { - allDocumentPages = storageService.readProtoObject(TenantContext.getTenantId(), storageId, DocumentPageProto.AllDocumentPages.parser()); - } else { - DocumentPage[] oldDocumentPages = getOldData(storageInfo.dossierId(), storageInfo.fileId(), storageInfo.fileTypeName(), DocumentPage[].class); - if (oldDocumentPages == null) { - return null; - } - allDocumentPages = convertAllDocumentPages(oldDocumentPages); - } - - return allDocumentPages; - } - - - private T getOldData(String dossierId, String fileId, String fileType, Class valueType) { - - String oldStorageId = StorageIdUtils.getStorageId(dossierId, fileId, fileType, ".json"); - System.out.println("----------------> LOOKING FOR " + oldStorageId); - try (InputStream inputStream = getObject(TenantContext.getTenantId(), oldStorageId)) { - return mapper.readValue(inputStream, valueType); - } catch (IOException e) { - log.error("Could not read JSON for " + fileType + ", error was: " + e); - return null; - } - } - - - private static EntryDataProto.EntryData convertEntryData(DocumentStructure.EntryData oldEntryData) { - - EntryDataProto.EntryData.Builder builder = EntryDataProto.EntryData.newBuilder(); - - builder.setType(NodeTypeProto.NodeType.valueOf(oldEntryData.getType().name())); - builder.addAllTreeId(Arrays.stream(oldEntryData.getTreeId()).boxed() - .collect(Collectors.toList())); - 
builder.addAllAtomicBlockIds(Arrays.asList(oldEntryData.getAtomicBlockIds())); - builder.addAllPageNumbers(Arrays.asList(oldEntryData.getPageNumbers())); - - builder.putAllProperties(oldEntryData.getProperties()); - - if (oldEntryData.getChildren() != null) { - oldEntryData.getChildren() - .forEach(child -> builder.addChildren(convertEntryData(child))); - } - - return builder.build(); - } - - - private static DocumentStructureProto.DocumentStructure convertDocumentStructure(DocumentStructure oldStructure) { - - DocumentStructureProto.DocumentStructure.Builder newBuilder = DocumentStructureProto.DocumentStructure.newBuilder(); - - if (oldStructure.getRoot() != null) { - newBuilder.setRoot(convertEntryData(oldStructure.getRoot())); - } - - return newBuilder.build(); - } - - - private static DocumentPageProto.DocumentPage convertDocumentPage(DocumentPage oldPage) { - - return DocumentPageProto.DocumentPage.newBuilder() - .setNumber(oldPage.getNumber()) - .setHeight(oldPage.getHeight()) - .setWidth(oldPage.getWidth()) - .setRotation(oldPage.getRotation()) - .build(); - } - - - private static DocumentPageProto.AllDocumentPages convertAllDocumentPages(DocumentPage[] oldPages) { - - DocumentPageProto.AllDocumentPages.Builder allPagesBuilder = DocumentPageProto.AllDocumentPages.newBuilder(); - - for (DocumentPage oldPage : oldPages) { - DocumentPageProto.DocumentPage newPage = convertDocumentPage(oldPage); - allPagesBuilder.addDocumentPages(newPage); - } - - return allPagesBuilder.build(); - } - - - private static DocumentPositionDataProto.DocumentPositionData convertDocumentPositionData(DocumentPositionData oldData) { - - DocumentPositionDataProto.DocumentPositionData.Builder builder = DocumentPositionDataProto.DocumentPositionData.newBuilder() - .setId(oldData.getId()) - .addAllStringIdxToPositionIdx(Arrays.stream(oldData.getStringIdxToPositionIdx()).boxed() - .collect(Collectors.toList())); - - for (float[] pos : oldData.getPositions()) { - 
DocumentPositionDataProto.DocumentPositionData.Position position = DocumentPositionDataProto.DocumentPositionData.Position.newBuilder() - .addAllValue(Floats.asList(pos)) - .build(); - builder.addPositions(position); - } - - return builder.build(); - } - - - private static DocumentPositionDataProto.AllDocumentPositionData convertAllDocumentPositionData(DocumentPositionData[] oldDataList) { - - DocumentPositionDataProto.AllDocumentPositionData.Builder allDataBuilder = DocumentPositionDataProto.AllDocumentPositionData.newBuilder(); - - for (DocumentPositionData oldData : oldDataList) { - allDataBuilder.addDocumentPositionData(convertDocumentPositionData(oldData)); - } - - return allDataBuilder.build(); - } - - - private static DocumentTextDataProto.DocumentTextData convertDocumentTextData(DocumentTextData oldData) { - - DocumentTextDataProto.DocumentTextData.Builder builder = DocumentTextDataProto.DocumentTextData.newBuilder() - .setId(oldData.getId()) - .setPage(oldData.getPage()) - .setSearchText(oldData.getSearchText()) - .setNumberOnPage(oldData.getNumberOnPage()) - .setStart(oldData.getStart()) - .setEnd(oldData.getEnd()) - .addAllLineBreaks(Arrays.stream(oldData.getLineBreaks()).boxed() - .collect(Collectors.toList())); - - return builder.build(); - } - - - private static DocumentTextDataProto.AllDocumentTextData convertAllDocumentTextData(DocumentTextData[] oldDataList) { - - DocumentTextDataProto.AllDocumentTextData.Builder allDataBuilder = DocumentTextDataProto.AllDocumentTextData.newBuilder(); - - for (DocumentTextData oldData : oldDataList) { - allDataBuilder.addDocumentTextData(convertDocumentTextData(oldData)); - } - - return allDataBuilder.build(); - } - - - @SneakyThrows - private InputStream getObject(String tenantId, String storageId) { - - File tempFile = File.createTempFile("temp", ".data"); - storageService.downloadTo(tenantId, storageId, tempFile); - return new BufferedInputStream(Files.newInputStream(Paths.get(tempFile.getPath()), 
StandardOpenOption.DELETE_ON_CLOSE)); - } - } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java index f5e76a5..a715507 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/services/LlmNerService.java @@ -22,6 +22,8 @@ import com.azure.ai.openai.models.ChatRequestUserMessage; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; +import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.llm.service.ChunkingResponse; import com.knecon.fforesight.llm.service.EntityAiDescription; @@ -29,8 +31,6 @@ import com.knecon.fforesight.llm.service.LlmNerEntities; import com.knecon.fforesight.llm.service.LlmNerEntity; import com.knecon.fforesight.llm.service.LlmNerMessage; import com.knecon.fforesight.llm.service.SystemMessageProvider; -import com.knecon.fforesight.llm.service.document.nodes.Document; -import com.knecon.fforesight.llm.service.document.textblock.TextBlock; import com.knecon.fforesight.llm.service.models.Chunk; import com.knecon.fforesight.llm.service.utils.FormattingUtils; import com.knecon.fforesight.tenantcommons.TenantContext; @@ -130,16 +130,15 @@ public class LlmNerService { try { entitiesWithUsage = mapEntitiesToDocument(chatCompletions, chunk.parts()); } catch (JsonProcessingException e) { - String faultyResponse = chatCompletions.getChoices() - .get(0).getMessage().getContent(); + String faultyResponse = 
chatCompletions.getChoices().get(0).getMessage().getContent(); ChatCompletions correctionCompletions = runLLM(SystemMessageProvider.PROMPT_CORRECTION, faultyResponse); try { entitiesWithUsage = mapEntitiesToDocument(correctionCompletions, chunk.parts()); - int completionTokens = chatCompletions.getUsage().getCompletionTokens() + correctionCompletions.getUsage().getCompletionTokens(); - int promptTokens = chatCompletions.getUsage().getPromptTokens() + correctionCompletions.getUsage().getPromptTokens(); - entitiesWithUsage = new EntitiesWithUsage(entitiesWithUsage.entities(), completionTokens, promptTokens); + entitiesWithUsage = new EntitiesWithUsage(entitiesWithUsage.entities(), + entitiesWithUsage.promptTokens() + chatCompletions.getUsage().getPromptTokens(), + entitiesWithUsage.completionTokens() + chatCompletions.getUsage().getCompletionTokens()); } catch (JsonProcessingException ex) { throw new RuntimeException(ex); @@ -165,11 +164,10 @@ public class LlmNerService { private EntitiesWithUsage mapEntitiesToDocument(ChatCompletions chatCompletions, List chunkParts) throws JsonProcessingException { - EntitiesWithUsage allEntities = new EntitiesWithUsage(new LinkedList<>(), chatCompletions.getUsage().getCompletionTokens(), chatCompletions.getUsage().getPromptTokens()); + EntitiesWithUsage allEntities = new EntitiesWithUsage(new LinkedList<>(), chatCompletions.getUsage().getPromptTokens(), chatCompletions.getUsage().getCompletionTokens()); if (!chatCompletions.getChoices().isEmpty()) { - ChatChoice choice = chatCompletions.getChoices() - .get(0); + ChatChoice choice = chatCompletions.getChoices().get(0); Map> entitiesPerType = parseResponse(choice); List entitiesFromResponse = entitiesPerType.entrySet() @@ -236,7 +234,7 @@ public class LlmNerService { } - private record EntitiesWithUsage(List entities, int completionTokens, int promptTokens) { + private record EntitiesWithUsage(List entities, int promptTokens, int completionTokens) { } diff --git 
a/llm-service/llm-service-server/build.gradle.kts b/llm-service/llm-service-server/build.gradle.kts index 2bd9321..4d5db28 100644 --- a/llm-service/llm-service-server/build.gradle.kts +++ b/llm-service/llm-service-server/build.gradle.kts @@ -36,7 +36,7 @@ dependencies { implementation("com.knecon.fforesight:keycloak-commons:0.30.0") { exclude(group = "com.knecon.fforesight", module = "tenant-commons") } - implementation("com.knecon.fforesight:tenant-commons:0.30.0") + implementation("com.knecon.fforesight:tenant-commons:0.31.0") implementation("com.knecon.fforesight:swagger-commons:0.7.0") implementation("ch.qos.logback:logback-classic") diff --git a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantExchangeMessageReceiverImpl.java b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantExchangeMessageReceiverImpl.java deleted file mode 100644 index 18fe904..0000000 --- a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantExchangeMessageReceiverImpl.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.knecon.fforesight.llm.service.queue; - -import java.util.Map; -import java.util.Set; - -import org.springframework.amqp.rabbit.annotation.RabbitHandler; -import org.springframework.amqp.rabbit.annotation.RabbitListener; -import org.springframework.boot.context.event.ApplicationReadyEvent; -import org.springframework.context.event.EventListener; -import org.springframework.stereotype.Service; - -import com.knecon.fforesight.llm.service.QueueNames; -import com.knecon.fforesight.tenantcommons.TenantProvider; -import com.knecon.fforesight.tenantcommons.model.TenantCreatedEvent; -import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration; -import com.knecon.fforesight.tenantcommons.model.TenantResponse; -import com.knecon.fforesight.tenantcommons.queue.RabbitQueueFromExchangeService; -import 
com.knecon.fforesight.tenantcommons.queue.TenantExchangeMessageReceiver; - -@Service -public class TenantExchangeMessageReceiverImpl extends TenantExchangeMessageReceiver { - - public TenantExchangeMessageReceiverImpl(RabbitQueueFromExchangeService rabbitQueueService, TenantProvider tenantProvider) { - - super(rabbitQueueService, tenantProvider); - } - - - @Override - protected Set getTenantQueueConfigs() { - - return Set.of(TenantQueueConfiguration.builder() - .listenerId(MessageHandler.LLM_NER_REQUEST_LISTENER_ID) - .exchangeName(QueueNames.LLM_NER_REQUEST_EXCHANGE) - .queuePrefix(QueueNames.LLM_NER_REQUEST_QUEUE_PREFIX) - .dlqName(QueueNames.LLM_NER_DLQ) - .arguments(Map.of("x-max-priority", 2)) - .build()); - } - - - @EventListener(ApplicationReadyEvent.class) - public void onApplicationReady() { - - System.out.println("application ready invoked"); - super.initializeQueues(); - } - - - @RabbitHandler - @RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantCreatedQueueName()}") - public void reactToTenantCreation(TenantCreatedEvent tenantCreatedEvent) { - - super.reactToTenantCreation(tenantCreatedEvent); - } - - - @RabbitHandler - @RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantDeletedQueueName()}") - public void reactToTenantDeletion(TenantResponse tenantResponse) { - - super.reactToTenantDeletion(tenantResponse); - - } - -} - diff --git a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantMessagingConfigurationImpl.java b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantMessagingConfigurationImpl.java deleted file mode 100644 index 631fa25..0000000 --- a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantMessagingConfigurationImpl.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.knecon.fforesight.llm.service.queue; - -import org.springframework.context.annotation.Configuration; - -import 
com.knecon.fforesight.tenantcommons.queue.TenantMessagingConfiguration; - -@Configuration -public class TenantMessagingConfigurationImpl extends TenantMessagingConfiguration { - - -} \ No newline at end of file diff --git a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantQueueProviderConfig.java b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantQueueProviderConfig.java new file mode 100644 index 0000000..bd2cc77 --- /dev/null +++ b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/TenantQueueProviderConfig.java @@ -0,0 +1,28 @@ +package com.knecon.fforesight.llm.service.queue; + +import java.util.Map; +import java.util.Set; + +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +import com.knecon.fforesight.llm.service.QueueNames; +import com.knecon.fforesight.tenantcommons.model.TenantQueueProvider; + +@Configuration +public class TenantQueueProviderConfig { + + @Bean + protected TenantQueueProvider getTenantQueueConfigs() { + + return new TenantQueueProvider(Set.of(com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration.builder() + .listenerId(MessageHandler.LLM_NER_REQUEST_LISTENER_ID) + .exchangeName(QueueNames.LLM_NER_REQUEST_EXCHANGE) + .queuePrefix(QueueNames.LLM_NER_REQUEST_QUEUE_PREFIX) + .dlqName(QueueNames.LLM_NER_DLQ) + .arguments(Map.of("x-max-priority", 2)) + .build())); + } + +} + diff --git a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/websocket/WebSocketMessagingService.java b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/websocket/WebSocketMessagingService.java index 13e8099..389eb12 100644 --- a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/websocket/WebSocketMessagingService.java +++ 
b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/websocket/WebSocketMessagingService.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.llm.service.websocket; import org.springframework.messaging.simp.SimpMessagingTemplate; -import org.springframework.security.core.parameters.P; import org.springframework.stereotype.Service; import com.knecon.fforesight.llm.service.services.WebSocketMessagingTemplate; diff --git a/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java b/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java index 6affa98..87f9c0a 100644 --- a/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java +++ b/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java @@ -37,7 +37,7 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest { @SneakyThrows public void testLlmNer() { - Path folder = Path.of("/Users/maverickstuder/Downloads/10-09-2024-16-03-47_files_list"); + Path folder = Path.of("/home/kschuettler/Downloads/New Folder (5)/18299ec0-7659-496a-a44a-194bbffb1700/1fb7d49ae389469f60db516cf81a3510"); LlmNerMessage message = prepStorage(folder); llmNerService.runNer(message); Path tmpFile = Path.of("/private/tmp", "LLM_ENTITIES", "entities.json"); diff --git a/llm-service/llm-service-server/src/test/resources/application.yaml b/llm-service/llm-service-server/src/test/resources/application.yaml index d244bc2..081c234 100644 --- a/llm-service/llm-service-server/src/test/resources/application.yaml +++ b/llm-service/llm-service-server/src/test/resources/application.yaml @@ -13,5 +13,5 @@ keyword-service.url: "http://mock.url" llm-service: - azureOpenAiKey: "Your key here" + azureOpenAiKey: "Your key here" azureOpenAiEndpoint: "https://knecon-ca-demo.openai.azure.com/" diff --git 
a/llm-service/llm-service-server/tmp/AAA_LLM_ENTITIES/entities.json b/llm-service/llm-service-server/tmp/AAA_LLM_ENTITIES/entities.json deleted file mode 100644 index 47b094d..0000000 --- a/llm-service/llm-service-server/tmp/AAA_LLM_ENTITIES/entities.json +++ /dev/null @@ -1 +0,0 @@ -{"entities":[{"value":"Kalt R.","type":"PII","startOffset":1951,"endOffset":1958},{"value":"Kalt R.","type":"PII","startOffset":3338,"endOffset":3345},{"value":"Kalt R.","type":"PII","startOffset":3476,"endOffset":3483},{"value":"Kalt R.","type":"PII","startOffset":3821,"endOffset":3828},{"value":"Jackson W.A.","type":"PII","startOffset":2286,"endOffset":2298},{"value":"Jackson W.A.","type":"PII","startOffset":2790,"endOffset":2802},{"value":"Jackson W.A.","type":"PII","startOffset":2911,"endOffset":2923},{"value":"Jackson W.A.","type":"PII","startOffset":3096,"endOffset":3108},{"value":"Kalt R.","type":"PII","startOffset":5055,"endOffset":5062},{"value":"Kalt R.","type":"PII","startOffset":5233,"endOffset":5240},{"value":"Kalt R.","type":"PII","startOffset":5895,"endOffset":5902},{"value":"Kalt R.","type":"PII","startOffset":5909,"endOffset":5916},{"value":"Kalt R.","type":"PII","startOffset":5931,"endOffset":5938},{"value":"Kalt R.","type":"PII","startOffset":5960,"endOffset":5967},{"value":"Kalt R.","type":"PII","startOffset":5989,"endOffset":5996},{"value":"Kalt R.","type":"PII","startOffset":6018,"endOffset":6025},{"value":"Kalt R.","type":"PII","startOffset":7253,"endOffset":7260},{"value":"Kalt R.","type":"PII","startOffset":7281,"endOffset":7288},{"value":"Kalt R.","type":"PII","startOffset":7309,"endOffset":7316},{"value":"Kalt R.","type":"PII","startOffset":7337,"endOffset":7344},{"value":"Kalt R. 
2009c","type":"PII","startOffset":10056,"endOffset":10069},{"value":"Kalt R.","type":"PII","startOffset":10767,"endOffset":10774},{"value":"Kalt R.","type":"PII","startOffset":10780,"endOffset":10787},{"value":"Kalt R.","type":"PII","startOffset":10802,"endOffset":10809},{"value":"Kalt R.","type":"PII","startOffset":10830,"endOffset":10837},{"value":"Kalt R.","type":"PII","startOffset":10858,"endOffset":10865},{"value":"Kalt R.","type":"PII","startOffset":10886,"endOffset":10893},{"value":"Kalt R.","type":"PII","startOffset":11980,"endOffset":11987},{"value":"Kalt R.","type":"PII","startOffset":12008,"endOffset":12015},{"value":"Kalt R.","type":"PII","startOffset":12036,"endOffset":12043},{"value":"Kalt R.","type":"PII","startOffset":12064,"endOffset":12071},{"value":"Kalt R.","type":"PII","startOffset":13814,"endOffset":13821},{"value":"Kalt R.","type":"PII","startOffset":14598,"endOffset":14605},{"value":"Kalt R.","type":"PII","startOffset":14855,"endOffset":14862},{"value":"Kalt R.","type":"PII","startOffset":15149,"endOffset":15156},{"value":"Kalt R.","type":"PII","startOffset":15481,"endOffset":15488},{"value":"Kalt R. 
2009c","type":"PII","startOffset":16392,"endOffset":16405},{"value":"Kalt R.","type":"PII","startOffset":17850,"endOffset":17857},{"value":"Kalt R.","type":"PII","startOffset":18284,"endOffset":18291},{"value":"Kalt R.","type":"PII","startOffset":18932,"endOffset":18939},{"value":"Kalt R.","type":"PII","startOffset":19412,"endOffset":19419},{"value":"Kalt R.","type":"PII","startOffset":19660,"endOffset":19667},{"value":"Kalt R.","type":"PII","startOffset":19973,"endOffset":19980},{"value":"Kalt R.","type":"PII","startOffset":20246,"endOffset":20253},{"value":"Kalt R.","type":"PII","startOffset":20522,"endOffset":20529},{"value":"Jackson W.","type":"PII","startOffset":19197,"endOffset":19207},{"value":"Briswalter C.","type":"PII","startOffset":20778,"endOffset":20791},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19003,"endOffset":19052},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19529,"endOffset":19578},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19776,"endOffset":19825},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20046,"endOffset":20095},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20362,"endOffset":20411},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20638,"endOffset":20687},{"value":"Syngenta Technology & Projects, Huddersfield, United Kingdom","type":"ADDRESS","startOffset":19265,"endOffset":19325},{"value":"Syngenta Crop Protection AG, Basel, Switzerland","type":"ADDRESS","startOffset":20809,"endOffset":20856}]} \ No newline at end of file