SPIKE: LLM-NER

This commit is contained in:
Kilian Schuettler 2024-08-26 18:51:16 +02:00
parent 5ac4ff9ff7
commit 32c618c35b
88 changed files with 5782 additions and 276 deletions

View File

@ -1,4 +1,23 @@
variables:
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

View File

@ -1,149 +0,0 @@
import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
plugins {
java
id("org.springframework.boot") version "3.3.2"
id("io.spring.dependency-management") version "1.1.6"
id("org.sonarqube") version "4.4.1.3373"
id("io.freefair.lombok") version "8.6"
pmd
checkstyle
jacoco
}
group = "com.knecon.fforesight"
java.sourceCompatibility = JavaVersion.VERSION_17
configurations {
compileOnly {
extendsFrom(configurations.annotationProcessor.get())
}
}
pmd {
isConsoleOutput = true
}
tasks.pmdMain {
pmd.ruleSetFiles = files("${projectDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
pmd.ruleSetFiles = files("${projectDir}/config/pmd/test_pmd.xml")
}
tasks.jacocoTestReport {
reports {
xml.required.set(false)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
repositories {
mavenLocal()
mavenCentral()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/");
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
tasks.register("publish") {
}
tasks.named<BootBuildImage>("bootBuildImage") {
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
if (project.hasProperty("buildbootDockerHostNetwork")) {
network.set("host")
}
docker {
if (project.hasProperty("buildbootDockerHostNetwork")) {
bindHostToBuilder.set(true)
}
verboseLogging.set(true)
publishRegistry {
username.set(providers.gradleProperty("mavenUser").getOrNull())
password.set(providers.gradleProperty("mavenPassword").getOrNull())
email.set(providers.gradleProperty("mavenEmail").getOrNull())
url.set("https://nexus.knecon.com:5001/")
}
}
}
configurations {
all {
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
exclude(group = "commons-logging", module = "commons-logging")
}
}
extra["springCloudVersion"] = "2022.0.5"
extra["testcontainersVersion"] = "1.20.0"
dependencies {
implementation("org.springframework.boot:spring-boot-starter-actuator")
implementation("org.springframework.boot:spring-boot-starter-amqp")
implementation("org.springframework.boot:spring-boot-starter-web")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign")
implementation("org.springframework.boot:spring-boot-starter-websocket")
implementation("org.springframework.security:spring-security-messaging:6.1.3")
implementation("com.iqser.red.commons:storage-commons:2.49.0")
implementation("com.knecon.fforesight:keycloak-commons:0.29.0")
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
implementation("com.azure:azure-ai-openai:1.0.0-beta.5")
developmentOnly("org.springframework.boot:spring-boot-devtools")
annotationProcessor("org.springframework.boot:spring-boot-configuration-processor")
testImplementation("org.springframework.boot:spring-boot-starter-test")
testImplementation("org.springframework.amqp:spring-rabbit-test")
implementation("ch.qos.logback:logback-classic")
}
dependencyManagement {
imports {
mavenBom("org.testcontainers:testcontainers-bom:${property("testcontainersVersion")}")
mavenBom("org.springframework.cloud:spring-cloud-dependencies:${property("springCloudVersion")}")
}
}
tasks.withType<Test> {
minHeapSize = "1024m"
maxHeapSize = "2048m"
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
}
sonarqube {
properties {
providers.gradleProperty("sonarToken").getOrNull()?.let { property("sonar.login", it) }
property("sonar.host.url", "https://sonarqube.knecon.com")
}
}
tasks.test {
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}
tasks.jacocoTestReport {
dependsOn(tasks.test) // tests are required to run before generating the report
reports {
xml.required.set(true)
csv.required.set(false)
}
}

View File

@ -0,0 +1,7 @@
plugins {
`kotlin-dsl`
}
repositories {
gradlePluginPortal()
}

View File

@ -0,0 +1,86 @@
plugins {
`java-library`
`maven-publish`
pmd
checkstyle
jacoco
}
group = "com.knecon.fforesight"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
tasks.named<Test>("test") {
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
minHeapSize = "512m"
maxHeapSize = "2048m"
}
tasks.test {
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}
tasks.jacocoTestReport {
dependsOn(tasks.test) // tests are required to run before generating the report
reports {
xml.required.set(true)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
// Shared Javadoc and publishing setup applied through `allprojects`.
allprojects {
// Javadoc: switch doclint off and cap reported warnings at one, so documentation lint does not flood the build output.
tasks.withType<Javadoc> {
options {
this as StandardJavadocDocletOptions
addBooleanOption("Xdoclint:none", true)
addStringOption("Xmaxwarns", "1")
}
}
publishing {
publications {
// One Maven publication built from the `java` component.
// NOTE(review): `name` presumably resolves to the project name here — confirm.
create<MavenPublication>(name) {
from(components["java"])
}
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
credentials {
// Credentials come from the Gradle properties `mavenUser` / `mavenPassword`; both are null when the property is absent.
username = providers.gradleProperty("mavenUser").getOrNull()
password = providers.gradleProperty("mavenPassword").getOrNull()
}
}
}
}
}
java {
withJavadocJar()
}
repositories {
mavenLocal()
mavenCentral()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull()
password = providers.gradleProperty("mavenPassword").getOrNull()
}
}
}

View File

@ -19,6 +19,7 @@
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>

View File

@ -1,16 +1,20 @@
<?xml version="1.0"?>
<ruleset name="Custom Rules"
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 https://pmd.sourceforge.io/ruleset_2_0_0.xsd">
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>Knecon test pmd rules</description>
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="DataflowAnomalyAnalysis"/>
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="BeanMembersShouldSerialize"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -0,0 +1,5 @@
plugins {
`maven-publish`
id("com.knecon.fforesight.service.java-conventions")
id("io.freefair.lombok") version "8.4"
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.llm.service.api.model;
package com.knecon.fforesight.llm.service;
import lombok.AllArgsConstructor;
import lombok.Builder;

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.llm.service;
import java.util.List;
import java.util.Map;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Data holder for a chunking response.
 *
 * <p>Lombok generates getters/setters, equals/hashCode/toString ({@code @Data}), a builder and both a
 * no-args and an all-args constructor; {@code @FieldDefaults} makes every field private.
 * The field order defines the parameter order of the generated all-args constructor.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ChunkingResponse {
// NOTE(review): a map despite the singular "path" name; the meaning of keys and values is not visible here — confirm with the producer.
Map<String, String> targetFilePath;
String responseFilePath;
// Chunk payloads; presumably one element per chunk — confirm.
List<ChunkingResponseData> data;
}

View File

@ -0,0 +1,26 @@
package com.knecon.fforesight.llm.service;
import java.util.List;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Data holder for a single entry of {@link ChunkingResponse#getData()}.
 *
 * <p>Lombok generates accessors, equals/hashCode/toString, a builder and no-args/all-args constructors;
 * all fields are private via {@code @FieldDefaults}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ChunkingResponseData {
// Boxed, so the id may be null (e.g. after the no-args constructor).
Integer chunkId;
String text;
List<String> types;
// NOTE(review): presumably one tree id (a list of child indices, as used by the document tree) per node covered by the chunk — confirm.
List<List<Integer>> treeIds;
float[] embedding;
// Boxed, so the count may be null.
Integer tokenCount;
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.llm.service.api;
package com.knecon.fforesight.llm.service;
import java.time.OffsetDateTime;

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.llm.service;
import java.util.List;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Wrapper around a list of {@link LlmNerEntity} objects.
 *
 * <p>Lombok generates accessors, equals/hashCode/toString, a builder and no-args/all-args constructors.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerEntities {
// Null unless set: neither the no-args constructor nor the builder assigns a default.
List<LlmNerEntity> entities;
}

View File

@ -0,0 +1,22 @@
package com.knecon.fforesight.llm.service;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * A single named entity: its text value, its type and its start/end offsets.
 *
 * <p>Lombok generates accessors, equals/hashCode/toString, a builder and no-args/all-args constructors.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerEntity {
String value;
String type;
// NOTE(review): the reference text for the offsets and whether endOffset is exclusive are not visible here — confirm.
int startOffset;
int endOffset;
}

View File

@ -0,0 +1,29 @@
package com.knecon.fforesight.llm.service;
import java.util.Map;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Message payload carrying an identifier map plus the storage ids of the inputs
 * (chunks, document structure, text, positions, pages) and of the result.
 *
 * <p>Lombok generates accessors, equals/hashCode/toString, a builder and no-args/all-args constructors.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerMessage {
// The same field exists on LlmNerResponseMessage; presumably echoed back to correlate request and response — confirm.
Map<String, String> identifier;
String chunksStorageId;
String documentStructureStorageId;
String documentTextStorageId;
String documentPositionStorageId;
String documentPagesStorageId;
String resultStorageId;
}

View File

@ -0,0 +1,24 @@
package com.knecon.fforesight.llm.service;
import java.util.Map;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Message payload carrying an identifier map together with token usage and a duration.
 *
 * <p>Lombok generates accessors, equals/hashCode/toString, a builder and no-args/all-args constructors.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerResponseMessage {
Map<String, String> identifier;
int promptTokens;
int completionTokens;
// NOTE(review): the time unit of the duration is not specified here — confirm with the producer.
int duration;
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.llm.service.api.model;
package com.knecon.fforesight.llm.service;
import java.util.ArrayList;
import java.util.List;

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.llm.service;
/**
 * Queue name constants of the LLM NER service: the request queue, the response queue and the
 * dead-letter queue.
 *
 * <p>This is a pure constant holder. It is {@code final} and has a private constructor so that it can
 * neither be instantiated nor subclassed.
 */
public final class QueueNames {

    public static final String LLM_NER_SERVICE_QUEUE = "llm_entity_request_queue";
    public static final String LLM_NER_SERVICE_RESPONSE_QUEUE = "llm_entity_response_queue";
    public static final String LLM_NER_SERVICE_DLQ = "llm_entity_dead_letter_queue";

    private QueueNames() {
        // constant holder, not meant to be instantiated
    }
}

View File

@ -0,0 +1,22 @@
// Module build script: applies the shared java-conventions plugin plus Lombok and declares the module's dependencies.
plugins {
id("com.knecon.fforesight.service.java-conventions")
id("io.freefair.lombok") version "8.4"
}
configurations {
all {
// Drop Spring Boot's default logging starter from every configuration; logback-classic is declared explicitly below.
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
}
}
// NOTE(review): neither extra property is referenced anywhere in this script (no dependencyManagement/BOM import here) — confirm they are still needed.
extra["springCloudVersion"] = "2022.0.5"
extra["testcontainersVersion"] = "1.20.0"
dependencies {
implementation(project(":llm-service-api"))
implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.159.0")
implementation("com.iqser.red.commons:storage-commons:2.49.0")
// NOTE(review): versions are pinned per dependency because this script applies no Spring Boot plugin or BOM; confirm 3.1.1 matches the Spring Boot version of the application that consumes this module.
implementation("org.springframework.boot:spring-boot-starter:3.1.1")
implementation("com.knecon.fforesight:tenant-commons:0.21.0")
implementation("com.azure:azure-ai-openai:1.0.0-beta.10")
implementation("ch.qos.logback:logback-classic:1.5.7")
}

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.llm.service;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration entry point of this module.
 *
 * <p>The class has no members. {@code @ComponentScan} without arguments scans the package of this
 * class and its sub-packages, and {@code @EnableConfigurationProperties} registers
 * {@link LlmServiceSettings} as a configuration-properties bean.
 */
@Configuration
@ComponentScan
@EnableConfigurationProperties(LlmServiceSettings.class)
public class LlmServiceConfiguration {
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.llm.service.settings;
package com.knecon.fforesight.llm.service;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
@ -7,19 +7,12 @@ import org.springframework.context.annotation.Primary;
import lombok.Data;
@Data
@Primary
@Configuration
@ConfigurationProperties("llm-service")
public class LlmServiceSettings {
private String requestQueueName = "llm_request_queue";
private String responseQueueName = "llm_response_queue";
private String deadLetterQueueName = "llm_dead_letter_queue";
private String azureOpenAiKey;
private String azureOpenAiEndpoint;
private String model = "gpt-4-cqs-dev";
private String model = "gpt-4o-mini";
private int concurrency = 8;
}

View File

@ -1,7 +1,28 @@
package com.knecon.fforesight.llm.service.model;
package com.knecon.fforesight.llm.service;
public class SystemMessages {
/**
 * System prompt for named-entity recognition. It asks the model to extract entities of the four
 * classes PII, ADDRESS, COMPANY and COUNTRY and to answer with a JSON-like object that maps each
 * class to a list of strings.
 *
 * <p>NOTE(review): the embedded example uses unquoted keys and has no commas between the
 * ADDRESS/COMPANY/COUNTRY lines, so it is not strictly valid JSON although the prompt demands
 * "strictly JSON format". Check how the response is parsed before changing the example.
 */
public static final String NER = """
You are tasked with finding all named entities in the following document.
The named entities should be mapped to four classes PII, ADDRESS, COMPANY, and COUNTRY.
A PII is any personally identifiable information including but not limited to names, email address, telephone number, fax numbers. Further, use your own judgement to add anything else.
Each name should be its own entity, but first name, last name and possibly middle name should be merged. Remember that numbers are never a part of a name.
An Address describes a real life location and should always be as complete as possible.
A COMPANY is any company or approving body mentioned in the text. But only if it's not part of an ADDRESS
A COUNTRY is any country. But only if it's not part of an ADDRESS
The output should be strictly JSON format and nothing else, formatted as such:
```
{
PII: ["Jennifer Durando, BS", "01223 45678", "mimi.lang@smithcorp.com", "+44 (0)1252 392460"],
ADDRESS: ["Product Safety Labs 2394 US Highway 130 Dayton, NJ 08810 USA", "Syngenta Crop Protection, LLC 410 Swing Road Post Office Box 18300 Greensboro, NC 27419-8300 USA"]
COMPANY: ["Syngenta", "EFSA"]
COUNTRY: ["USA"]
}
```
Always replace linebreaks with whitespaces, but except that, ensure the entities match the text in the document exactly.
It is important you mention all present entities, more importantly, it is preferable to mention too many than too little.
""";
public static String RULES_CO_PILOT = """
From now on, you are a Drools rule generator. This means you will start your answer with a step-by-step explanation how to write a rule, which will fulfill the prompt, followed by the rule.

View File

@ -0,0 +1,70 @@
package com.knecon.fforesight.llm.service.document;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.google.common.base.Functions;
/**
 * Collector that folds an ordered sequence of {@link TextRange}s into a list in which directly
 * adjacent ranges (the end of the previous range equals the start of the next one) are merged
 * into a single range.
 *
 * <p>Ranges must arrive in ascending order. A range that starts before the previously collected
 * range ends causes an {@link IllegalArgumentException}.
 *
 * <p>The combiner applies the same append logic as the accumulator, so ranges that meet at the seam
 * of two partial results are merged (and order-checked) exactly as in a sequential run.
 */
public class ConsecutiveBoundaryCollector implements Collector<TextRange, List<TextRange>, List<TextRange>> {

    @Override
    public Supplier<List<TextRange>> supplier() {

        return LinkedList::new;
    }


    @Override
    public BiConsumer<List<TextRange>, TextRange> accumulator() {

        return ConsecutiveBoundaryCollector::append;
    }


    @Override
    public BinaryOperator<List<TextRange>> combiner() {

        return (left, right) -> {
            // Re-append instead of addAll so that the last range of 'left' and the first range of
            // 'right' are merged when they touch, and rejected when they are out of order.
            right.forEach(range -> append(left, range));
            return left;
        };
    }


    @Override
    public Function<List<TextRange>, List<TextRange>> finisher() {

        return Functions.identity();
    }


    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }


    /**
     * Appends {@code next} to {@code ranges}, merging it into the last element when both touch.
     *
     * @param ranges the ranges collected so far, in ascending order
     * @param next   the range to append
     * @throws IllegalArgumentException if {@code next} starts before the last collected range ends
     */
    private static void append(List<TextRange> ranges, TextRange next) {

        if (ranges.isEmpty()) {
            ranges.add(next);
            return;
        }

        int lastIdx = ranges.size() - 1;
        TextRange previous = ranges.get(lastIdx);
        if (previous.end() > next.start()) {
            throw new IllegalArgumentException(String.format("Can't concatenate %s and %s. Boundaries must be ordered!", previous, next));
        }

        if (previous.end() == next.start()) {
            ranges.set(lastIdx, TextRange.merge(List.of(previous, next)));
        } else {
            ranges.add(next);
        }
    }

}

View File

@ -0,0 +1,76 @@
package com.knecon.fforesight.llm.service.document;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Stream;
import com.google.common.base.Functions;
import com.knecon.fforesight.llm.service.document.textblock.ConcatenatedTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.NoArgsConstructor;
/**
 * Collector that groups consecutive {@link TextBlock}s: a block whose text range starts exactly
 * where the previously collected block ends is concatenated onto that block, any other block
 * starts a new {@link ConcatenatedTextBlock}.
 *
 * <p>The accumulation type ({@code List<ConcatenatedTextBlock>}) differs from the result type
 * ({@code List<TextBlock>}), so the finisher is not the identity function. The collector therefore
 * does not declare {@link Characteristics#IDENTITY_FINISH}; declaring it would make the stream
 * framework skip the finisher and rely on an unchecked cast instead. The finisher returns a mutable
 * list, as callers received before.
 *
 * <p>NOTE(review): the combiner only concatenates partial results; blocks that meet at the seam of
 * two partitions of a parallel stream are not concatenated.
 */
@NoArgsConstructor
public class ConsecutiveTextBlockCollector implements Collector<TextBlock, List<ConcatenatedTextBlock>, List<TextBlock>> {

    @Override
    public Supplier<List<ConcatenatedTextBlock>> supplier() {

        return LinkedList::new;
    }


    @Override
    public BiConsumer<List<ConcatenatedTextBlock>, TextBlock> accumulator() {

        return (existingList, textBlock) -> {
            if (!existingList.isEmpty()) {
                ConcatenatedTextBlock prevBlock = existingList.get(existingList.size() - 1);
                if (prevBlock.getTextRange().end() == textBlock.getTextRange().start()) {
                    // directly adjacent to the previous block: extend it
                    prevBlock.concat(textBlock);
                    return;
                }
            }
            // first block, or a gap to the previous block: start a new concatenation
            ConcatenatedTextBlock ctb = ConcatenatedTextBlock.empty();
            ctb.concat(textBlock);
            existingList.add(ctb);
        };
    }


    @Override
    public BinaryOperator<List<ConcatenatedTextBlock>> combiner() {

        return (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                .toList();
    }


    @Override
    public Function<List<ConcatenatedTextBlock>, List<TextBlock>> finisher() {

        // Copy into a mutable list of the result element type.
        return concatenated -> new LinkedList<TextBlock>(concatenated);
    }


    @Override
    public Set<Characteristics> characteristics() {

        // No IDENTITY_FINISH: the finisher converts the element type and must run.
        return Set.of();
    }

}

View File

@ -0,0 +1,29 @@
package com.knecon.fforesight.llm.service.document;
import java.io.Serializable;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Bundles the four serialized parts of a parsed document: its pages, the text data, the position
 * data and the document structure. {@code DocumentGraphMapper#toDocumentGraph} consumes it.
 *
 * <p>Lombok generates accessors, equals/hashCode/toString, a builder and no-args/all-args constructors.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentData implements Serializable {
DocumentPage[] documentPages;
// DocumentGraphMapper indexes this array and documentPositionData with the same atomic text block id.
DocumentTextData[] documentTextData;
DocumentPositionData[] documentPositionData;
DocumentStructure documentStructure;
}

View File

@ -0,0 +1,229 @@
package com.knecon.fforesight.llm.service.document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.DuplicatedParagraph;
import com.knecon.fforesight.llm.service.document.nodes.Footer;
import com.knecon.fforesight.llm.service.document.nodes.Header;
import com.knecon.fforesight.llm.service.document.nodes.Headline;
import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.Paragraph;
import com.knecon.fforesight.llm.service.document.nodes.Section;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.SuperSection;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import lombok.experimental.UtilityClass;
/**
 * Rebuilds the in-memory {@link Document} graph (semantic nodes, pages, text blocks and the
 * {@link DocumentTree}) from the serialized {@link DocumentData}.
 */
@UtilityClass
public class DocumentGraphMapper {
/**
 * Builds the complete document graph.
 *
 * @param documentData the serialized pages, text data, position data and structure
 * @return the document with its tree, pages, page count and text block set
 */
public Document toDocumentGraph(DocumentData documentData) {
Document document = new Document();
DocumentTree documentTree = new DocumentTree(document);
Context context = new Context(documentData, documentTree);
// Pages are built first because buildEntries resolves pages by number while creating the nodes.
context.pageData.addAll(Arrays.stream(documentData.getDocumentPages())
.map(DocumentGraphMapper::buildPage)
.toList());
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
document.setDocumentTree(context.documentTree);
document.setPages(new HashSet<>(context.pageData));
document.setNumberOfPages(documentData.getDocumentPages().length);
// NOTE(review): assigns the value returned by the document's own getter; presumably getTextBlock() computes the block from the tree on first access — confirm.
document.setTextBlock(document.getTextBlock());
return document;
}
/**
 * Recursively converts structure entries into tree entries. For every entry it creates the matching
 * node, attaches the leaf text block (if the entry has atomic block ids), copies engines and tree id,
 * and registers the node on each of its pages.
 *
 * @throws UnsupportedOperationException for an entry type without a builder below
 */
private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {
List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
for (DocumentStructure.EntryData entryData : entries) {
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
.map(pageNumber -> getPage(pageNumber, context))
.toList();
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case SUPER_SECTION -> buildSuperSection(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
case TABLE -> buildTable(context, entryData.getProperties());
case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
};
// Only entries that carry atomic block ids get a leaf text block.
if (entryData.getAtomicBlockIds().length > 0) {
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
node.setLeafTextBlock(textBlock);
}
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
.toList();
if (entryData.getEngines() != null) {
entryData.getEngines()
.forEach(node::addEngine);
} else {
// Note: this writes back into the input entry, replacing a null engine set with an empty one.
entryData.setEngines(Collections.emptySet());
}
node.setTreeId(treeId);
// Headers and footers are stored in dedicated page slots; every other node goes into the page's main body.
switch (entryData.getType()) {
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
default -> pages.forEach(page -> page.getMainBody().add(node));
}
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
}
return newEntries;
}
private Headline buildHeadline(Context context) {
return Headline.builder().documentTree(context.documentTree).build();
}
/** Builds an image node on the single page it is expected to belong to. */
private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {
// Only checked when assertions are enabled (-ea); otherwise the first page number is used unconditionally.
assert pageNumbers.length == 1;
Page page = getPage(pageNumbers[0], context);
var builder = Image.builder();
PropertiesMapper.parseImageProperties(properties, builder);
return builder.documentTree(context.documentTree).page(page).build();
}
private TableCell buildTableCell(Context context, Map<String, String> properties) {
TableCell.TableCellBuilder<?, ?> builder = TableCell.builder();
PropertiesMapper.parseTableCellProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
private Table buildTable(Context context, Map<String, String> properties) {
Table.TableBuilder builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.documentTree(context.documentTree).build();
}
private Footer buildFooter(Context context) {
return Footer.builder().documentTree(context.documentTree).build();
}
private Header buildHeader(Context context) {
return Header.builder().documentTree(context.documentTree).build();
}
private Section buildSection(Context context) {
return Section.builder().documentTree(context.documentTree).build();
}
private SuperSection buildSuperSection(Context context) {
return SuperSection.builder().documentTree(context.documentTree).build();
}
/**
 * Builds a paragraph node. When the properties mark the paragraph as a duplicate, a
 * {@link DuplicatedParagraph} is returned whose unsorted leaf text block is built from the
 * unsorted text block ids found in the properties.
 */
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
return Paragraph.builder().documentTree(context.documentTree).build();
}
/** Resolves each atomic text block id and collects the blocks into one text block owned by {@code parent}. */
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
.collect(new TextBlockCollector());
}
/**
 * Creates the atomic text block for the given id. The id is used as the list index into both the
 * text data and the position data; {@code Math.toIntExact} throws an ArithmeticException if it does
 * not fit into an int.
 */
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
}
private Page buildPage(DocumentPage p) {
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
}
/** Looks up a page by its number; the page with number n is read from list position n - 1. */
private Page getPage(Long pageIndex, Context context) {
Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
// Sanity check that list position and page number agree; only active with -ea.
assert page.getNumber() == Math.toIntExact(pageIndex);
return page;
}
/** Mutable state shared while one document graph is built. */
static final class Context {
private final DocumentTree documentTree;
// Starts empty and is filled by toDocumentGraph before any entry is built.
private final List<Page> pageData;
private final List<DocumentTextData> documentTextData;
private final List<DocumentPositionData> documentPositionData;
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pageData = new ArrayList<>();
this.documentTextData = Arrays.stream(documentData.getDocumentTextData())
.toList();
this.documentPositionData = Arrays.stream(documentData.getDocumentPositionData())
.toList();
}
}
}

View File

@ -0,0 +1,371 @@
package com.knecon.fforesight.llm.service.document;
import static java.lang.String.format;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.GenericSemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.NodeType;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
@EqualsAndHashCode
public class DocumentTree {
private final Entry root;
/**
 * Creates a tree whose root entry wraps the given document, has an empty tree id and an empty,
 * mutable child list.
 *
 * @param document the node stored in the root entry
 */
public DocumentTree(Document document) {

    this.root = Entry.builder()
            .treeId(Collections.emptyList())
            .children(new LinkedList<>())
            .node(document)
            .build();
}
/**
 * Collects the leaf text blocks of all leaf nodes, in the order produced by
 * {@code allEntriesInOrder()}, into a single text block.
 *
 * @return the text block produced by {@link TextBlockCollector}
 */
public TextBlock buildTextBlock() {

    return allEntriesInOrder()
            .map(entry -> entry.getNode())
            .filter(node -> node.isLeaf())
            .map(leaf -> leaf.getLeafTextBlock())
            .collect(new TextBlockCollector());
}
/**
 * Adds {@code node} as a new child of the root entry.
 *
 * @param node the node to insert
 * @return the tree id assigned to the new entry
 */
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {

    List<Integer> rootId = Collections.emptyList();
    return createNewChildEntryAndReturnIdImpl(rootId, node);
}
/**
 * Adds {@code node} as a new child of the entry identified by the tree id of {@code parentNode}.
 *
 * @param parentNode the node whose entry receives the child
 * @param node       the node to insert
 * @return the tree id assigned to the new entry
 * @throws IllegalArgumentException if no entry exists for the parent's tree id
 */
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {

    List<Integer> parentId = parentNode.getTreeId();
    return createNewChildEntryAndReturnIdImpl(parentId, node);
}
/**
 * Adds the table {@code node} as a new child of the entry identified by the tree id of
 * {@code parentNode}.
 *
 * @param parentNode the node whose entry receives the table
 * @param node       the table to insert
 * @return the tree id assigned to the new entry
 * @throws IllegalArgumentException if no entry exists for the parent's tree id
 */
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {

    List<Integer> parentId = parentNode.getTreeId();
    return createNewChildEntryAndReturnIdImpl(parentId, node);
}
/**
 * Adds {@code tableCell} as a new child of the entry identified by the tree id of
 * {@code parentTable}.
 *
 * @param parentTable the table whose entry receives the cell
 * @param tableCell   the cell to insert
 * @return the tree id assigned to the new entry
 * @throws IllegalArgumentException if no entry exists for the table's tree id
 */
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {

    List<Integer> tableId = parentTable.getTreeId();
    return createNewChildEntryAndReturnIdImpl(tableId, tableCell);
}
/**
 * Appends a new entry for {@code node} to the children of the entry with id {@code parentId}.
 * The new tree id is the parent id extended by the index the child gets in the parent's child list.
 *
 * @param parentId tree id of the parent entry
 * @param node     the node to store in the new entry
 * @return the tree id of the new entry
 * @throws IllegalArgumentException if no entry exists for {@code parentId}
 */
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {

    if (entryExists(parentId)) {
        Entry parentEntry = getEntryById(parentId);
        List<Integer> childId = new LinkedList<>(parentId);
        childId.add(parentEntry.children.size());
        parentEntry.children.add(Entry.builder().treeId(childId).node(node).build());
        return childId;
    }
    throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
}
/**
 * Checks whether the path described by {@code treeId} can be walked from the root, i.e. whether
 * every index on the path is a valid position in the child list of the entry reached so far.
 *
 * @param treeId the child indices from the root down to the entry; empty for the root itself
 * @return {@code true} if the entry exists
 */
private boolean entryExists(List<Integer> treeId) {

    if (treeId.isEmpty()) {
        return root != null;
    }

    Entry current = root;
    for (int childIdx : treeId) {
        boolean outOfBounds = childIdx >= current.children.size() || childIdx < 0;
        if (outOfBounds) {
            return false;
        }
        current = current.children.get(childIdx);
    }
    return true;
}
/**
 * Returns the entry whose id is the parent id (as computed by {@code getParentId}) of the given
 * tree id.
 *
 * @param treeId tree id of the child entry
 * @return the parent entry
 */
public Entry getParentEntryById(List<Integer> treeId) {

    List<Integer> parentId = getParentId(treeId);
    return getEntryById(parentId);
}
/**
 * Tells whether the given tree id belongs to an entry below the root. Only the empty tree id has
 * no parent.
 *
 * @param treeId the tree id to inspect
 * @return {@code false} for the empty id, {@code true} otherwise
 */
public boolean hasParentById(List<Integer> treeId) {

    boolean isRootId = treeId.isEmpty();
    return !isRootId;
}
/**
 * Streams the nodes of the direct children of the entry with the given tree id.
 *
 * @param treeId tree id of the parent entry
 * @return the child nodes in child-list order
 */
public Stream<SemanticNode> childNodes(List<Integer> treeId) {

    Entry parentEntry = getEntryById(treeId);
    return parentEntry.children.stream()
            .map(child -> child.getNode());
}
/**
 * Collects the direct child nodes of the given entry that intersect {@code textRange}.
 *
 * <p>The first candidate is located with {@code findFirstIdxOfContainingChildBinarySearch} using the
 * start of {@code textRange}; a negative result yields an empty list. From that index on, children
 * are added for as long as their text range starts before the end of {@code textRange}; the scan
 * stops at the first child that starts at or after that end.
 *
 * @param treeId    the treeId of the Entry whose children shall be checked.
 * @param textRange The TextRange to find intersecting childNodes for.
 * @return A list of all SemanticNodes, that are direct children of the specified Entry, whose TextRange intersects the given TextRange
 */
public List<SemanticNode> findIntersectingChildNodes(List<Integer> treeId, TextRange textRange) {

    List<Entry> children = getEntryById(treeId).getChildren();
    List<SemanticNode> intersecting = new LinkedList<>();

    int idx = findFirstIdxOfContainingChildBinarySearch(children, textRange.start());
    if (idx < 0) {
        return intersecting;
    }

    while (idx < children.size()) {
        SemanticNode candidate = children.get(idx).getNode();
        boolean startsBeforeEnd = candidate.getTextRange().start() < textRange.end();
        if (!startsBeforeEnd) {
            break;
        }
        intersecting.add(candidate);
        idx++;
    }
    return intersecting;
}
/**
 * Finds the direct child of the specified entry whose TextRange fully contains the given TextRange.
 * A binary search locates the child containing the start index; that child is returned only if it also
 * covers the rest of the given range.
 *
 * @param treeId    the treeId of the Entry whose children shall be checked
 * @param textRange the TextRange that must be fully contained
 * @return the containing child node, or an empty Optional if no single child contains the whole range
 */
public Optional<SemanticNode> findFirstContainingChild(List<Integer> treeId, TextRange textRange) {
    List<Entry> childEntries = getEntryById(treeId).getChildren();
    int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
    if (startIdx < 0) {
        return Optional.empty();
    }
    SemanticNode candidate = childEntries.get(startIdx).getNode();
    // The end of a TextRange is exclusive, so it must be compared with "<=" against the child's end.
    // contains(int) treats its argument as an index and would reject a range ending exactly at the child's end.
    if (candidate.getTextRange().contains(textRange)) {
        return Optional.of(candidate);
    }
    return Optional.empty();
}
/**
 * Searches the children of the entry with the given tree id (and their descendants) for a table cell
 * matching the given start and end index.
 *
 * @param treeId tree id of the entry whose children are searched
 * @param start  start index to look up
 * @param end    end index that the cell's TextRange must contain
 * @return the matching table cell, preferring nested matches, or an empty Optional
 */
public Optional<TableCell> findTableCellInTable(List<Integer> treeId, int start, int end) {
    List<Entry> tableChildren = getEntryById(treeId).getChildren();
    return findTableCellInTableRecursively(tableChildren, start, end);
}
/**
 * Locates the entry containing {@code start} among {@code entries} and looks for a matching table cell,
 * descending into that entry's children first so that the most deeply nested match wins.
 *
 * @param entries sibling entries to search
 * @param start   start index used to pick the candidate entry
 * @param end     index that a table cell's TextRange must contain to count as a match
 * @return the deepest matching table cell, or an empty Optional
 */
private Optional<TableCell> findTableCellInTableRecursively(List<Entry> entries, int start, int end) {
    int candidateIdx = findFirstIdxOfContainingChildBinarySearch(entries, start);
    if (candidateIdx < 0) {
        return Optional.empty();
    }
    Entry candidate = entries.get(candidateIdx);
    boolean candidateIsMatchingCell = candidate.getNode().getTextRange().contains(end) && candidate.getNode() instanceof TableCell;
    // a match further down the tree always takes precedence over the candidate itself
    if (!candidate.getNode().isLeaf()) {
        Optional<TableCell> nestedMatch = findTableCellInTableRecursively(candidate.getChildren(), start, end);
        if (nestedMatch.isPresent()) {
            return nestedMatch;
        }
    }
    return candidateIsMatchingCell ? Optional.of((TableCell) candidate.getNode()) : Optional.empty();
}
/**
 * Binary search for the entry whose TextRange contains the given index.
 * NOTE(review): this presumes the entries are ordered by TextRange and do not overlap; confirm with callers.
 *
 * @param childNodes the entries to search
 * @param start      the index to look up
 * @return the position of the entry whose range contains {@code start}, or -1 if there is none
 */
private int findFirstIdxOfContainingChildBinarySearch(List<Entry> childNodes, int start) {
    int lowerBound = 0;
    int upperBound = childNodes.size() - 1;
    while (lowerBound <= upperBound) {
        int middle = (lowerBound + upperBound) >>> 1;
        TextRange candidate = childNodes.get(middle).getNode().getTextRange();
        if (start < candidate.start()) {
            upperBound = middle - 1;
            continue;
        }
        if (start >= candidate.end()) {
            lowerBound = middle + 1;
            continue;
        }
        return middle;
    }
    return -1;
}
/**
 * Streams the nodes of those direct children of the given entry that have the requested node type.
 *
 * @param treeId   tree id of the parent entry
 * @param nodeType the type a child node must have to be included
 * @return the matching child nodes, in child order
 */
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
    Entry parentEntry = getEntryById(treeId);
    return parentEntry.children.stream()
            .filter(child -> child.getType().equals(nodeType))
            .map(child -> child.getNode());
}
/**
 * Derives the tree id of the parent by dropping the last index of the given tree id.
 *
 * @param treeId tree id of the child
 * @return the empty list for direct children of the root, otherwise a sub-list view of {@code treeId}
 * @throws UnsupportedOperationException if {@code treeId} is empty, i.e. addresses the root
 */
private static List<Integer> getParentId(List<Integer> treeId) {
    if (treeId.isEmpty()) {
        throw new UnsupportedOperationException("Root has no parent!");
    }
    int parentDepth = treeId.size() - 1;
    return parentDepth == 0 ? Collections.emptyList() : treeId.subList(0, parentDepth);
}
/**
 * Looks up the node that follows the given entry among its siblings.
 *
 * @param treeId tree id of the entry whose successor is requested
 * @return the next sibling's node, or an empty Optional if the entry is the last child
 */
public Optional<SemanticNode> getNextSibling(List<Integer> treeId) {
    List<Integer> siblingTreeId = getNextSiblingId(treeId);
    return entryExists(siblingTreeId) ? Optional.of(getEntryById(siblingTreeId).getNode()) : Optional.empty();
}
/**
 * Computes the tree id of the following sibling by incrementing the last index of the given tree id.
 * The result is not checked for existence.
 *
 * @param treeId tree id of the current entry; must not be empty
 * @return a new list holding the sibling's tree id
 */
public List<Integer> getNextSiblingId(List<Integer> treeId) {
    int lastPosition = treeId.size() - 1;
    int ownIndex = treeId.get(lastPosition);
    List<Integer> siblingTreeId = new LinkedList<>(treeId.subList(0, lastPosition));
    siblingTreeId.add(ownIndex + 1);
    return siblingTreeId;
}
/**
 * Looks up the node that precedes the given entry among its siblings.
 *
 * @param treeId tree id of the entry whose predecessor is requested
 * @return the previous sibling's node, or an empty Optional if the entry is the first child
 */
public Optional<SemanticNode> getPreviousSibling(List<Integer> treeId) {
    List<Integer> siblingTreeId = getPreviousSiblingId(treeId);
    return entryExists(siblingTreeId) ? Optional.of(getEntryById(siblingTreeId).getNode()) : Optional.empty();
}
/**
 * Computes the tree id of the preceding sibling by decrementing the last index of the given tree id.
 * The result is not checked for existence; for a first child the last index becomes -1.
 *
 * @param treeId tree id of the current entry; must not be empty
 * @return a new list holding the sibling's tree id
 */
public List<Integer> getPreviousSiblingId(List<Integer> treeId) {
    int lastPosition = treeId.size() - 1;
    int ownIndex = treeId.get(lastPosition);
    List<Integer> siblingTreeId = new LinkedList<>(treeId.subList(0, lastPosition));
    siblingTreeId.add(ownIndex - 1);
    return siblingTreeId;
}
/**
 * Resolves a tree id to its entry by following the child indices from the root.
 * No bounds checking is performed beyond what the child lists do themselves.
 *
 * @param treeId path of child indices; the empty list yields the root
 * @return the entry addressed by {@code treeId}
 */
public Entry getEntryById(List<Integer> treeId) {
    Entry current = root;
    // an empty id leaves the loop untouched and therefore returns the root
    for (int childIdx : treeId) {
        current = current.children.get(childIdx);
    }
    return current;
}
/**
 * Streams the direct children of the root entry.
 *
 * @return the top-level entries, in child order
 */
public Stream<Entry> mainEntries() {
    List<Entry> topLevelEntries = root.children;
    return topLevelEntries.stream();
}
/**
 * Streams every entry of the tree, starting with the root, each entry followed by its descendants.
 *
 * @return all entries in depth-first pre-order
 */
public Stream<Entry> allEntriesInOrder() {
    Stream<Entry> rootOnly = Stream.of(root);
    return rootOnly.flatMap(entry -> flatten(entry));
}
/**
 * Streams all descendants of the entry with the given tree id; the entry itself is not included.
 *
 * @param parentId tree id of the entry whose subtree is streamed
 * @return the descendant entries in depth-first pre-order
 */
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
    Entry parentEntry = getEntryById(parentId);
    return parentEntry.children.stream()
            .flatMap(child -> flatten(child));
}
/**
 * Renders the tree as one line per entry, in depth-first pre-order.
 *
 * @return the string forms of all entries joined by line feeds
 */
@Override
public String toString() {
    List<String> entryLines = allEntriesInOrder().map(entry -> entry.toString())
            .toList();
    return String.join("\n", entryLines);
}
/**
 * Flattens the subtree rooted at the given entry.
 *
 * @param entry root of the subtree
 * @return the entry itself followed by all of its descendants, depth-first
 */
private static Stream<Entry> flatten(Entry entry) {
    Stream<Entry> self = Stream.of(entry);
    Stream<Entry> descendants = entry.children.stream()
            .flatMap(child -> flatten(child));
    return Stream.concat(self, descendants);
}
/**
 * Returns the node of the top-level entry (direct child of the root) that the given tree id lies under.
 *
 * @param treeId tree id of any entry
 * @return the root's node for the empty id, otherwise the node of the root child named by the first index
 */
public SemanticNode getHighestParentById(List<Integer> treeId) {
    Entry topLevelEntry = treeId.isEmpty() ? root : root.children.get(treeId.get(0));
    return topLevelEntry.node;
}
/**
 * One element of the document tree: a node together with its position in the tree and its child entries.
 * All fields are private and final; the child list itself stays mutable so children can be appended.
 */
@Builder
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public static class Entry {
// path of child indices leading from the root to this entry
List<Integer> treeId;
// the semantic node stored at this position
SemanticNode node;
// child entries in document order; defaults to an empty list when built via the builder
@Builder.Default
List<Entry> children = new ArrayList<>();
/**
 * @return the string form of the contained node
 */
@Override
public String toString() {
return node.toString();
}
/**
 * @return the type of the contained node
 */
public NodeType getType() {
return node.getType();
}
}
}

View File

@ -0,0 +1,72 @@
package com.knecon.fforesight.llm.service.document;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.ImageType;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import lombok.experimental.UtilityClass;
/**
 * Helpers that read the string-valued property maps of the document structure and transfer the parsed
 * values onto node builders.
 */
@UtilityClass
public class PropertiesMapper {

    /**
     * Copies image type, transparency, position and id from the properties onto the image builder.
     *
     * @param properties raw properties keyed by {@link DocumentStructure.ImageProperties} constants
     * @param builder    the builder to populate
     */
    public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {
        builder.imageType(ImageType.fromString(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE)));
        builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT)));
        builder.position(parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION)));
        builder.id(properties.get(DocumentStructure.ImageProperties.ID));
    }

    /**
     * Copies row, column, header flag and bounding box from the properties onto the table cell builder.
     *
     * @param properties raw properties keyed by {@link DocumentStructure.TableCellProperties} constants
     * @param builder    the builder to populate
     * @throws NumberFormatException if row or column are missing or not numeric
     */
    public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {
        builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW)));
        builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL)));
        builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER)));
        builder.bBox(parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX)));
    }

    /**
     * Copies the number of rows and columns from the properties onto the table builder.
     *
     * @param properties raw properties keyed by {@link DocumentStructure.TableProperties} constants
     * @param builder    the builder to populate
     * @throws NumberFormatException if either count is missing or not numeric
     */
    public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
        builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
        builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
    }

    /**
     * Parses a rectangle from its delimited string form; the four values are used as x, y, width and height.
     */
    private Rectangle2D parseRectangle2D(String bBox) {
        List<Float> floats = Arrays.stream(bBox.split(DocumentStructure.RECTANGLE_DELIMITER))
                .map(Float::parseFloat)
                .toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

    /**
     * @param properties raw node properties
     * @return true if the properties carry the unsorted text block id marker
     */
    public static boolean isDuplicateParagraph(Map<String, String> properties) {
        return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
    }

    /**
     * @param properties raw node properties containing the unsorted text block ids
     * @return the parsed ids
     */
    public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
        return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
    }

    /**
     * Parses a bracketed, comma-separated id list such as {@code "[1,2]"} or {@code "[1, 2]"}.
     * The first and last character are dropped without inspection; whitespace around the individual ids is
     * ignored and blank tokens are skipped, so {@code "[]"} yields an empty array.
     *
     * @param ids the bracketed id list
     * @return the ids in their original order
     * @throws NumberFormatException if a non-blank token is not a valid long
     */
    public static Long[] toLongArray(String ids) {
        return Arrays.stream(ids.substring(1, ids.length() - 1).split(","))
                .map(String::trim)
                .filter(token -> !token.isEmpty())
                .map(Long::valueOf)
                .toArray(Long[]::new);
    }
}

View File

@ -0,0 +1,175 @@
package com.knecon.fforesight.llm.service.document;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
/**
 * Static helpers for combining rectangles into bounding boxes.
 */
public class RectangleTransformations {

    /**
     * Computes the bounding box around all positions of the given atomic text blocks.
     *
     * @param atomicTextBlocks the text blocks whose positions are combined
     * @return the enclosing rectangle, or an empty rectangle at the origin if there are no positions
     */
    public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
        return atomicTextBlocks.stream()
                .flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
                        .stream())
                .collect(new Rectangle2DBBoxCollector());
    }

    /**
     * Computes the bounding box around the given rectangles.
     *
     * @param rectangle2DList the rectangles to enclose
     * @return the enclosing rectangle, or an empty rectangle at the origin if the collection is empty
     */
    public static Rectangle2D rectangle2DBBox(Collection<Rectangle2D> rectangle2DList) {
        return rectangle2DList.stream()
                .collect(new Rectangle2DBBoxCollector());
    }

    /**
     * If two rectangles are further apart than five times the average width of a rectangle, a gap is inserted.
     * The distance is measured between the right edge of a rectangle and the left edge of its successor.
     *
     * @param rectangle2DList A list of rectangles to combine
     * @return A list of rectangles which are combined if they are closer than the split threshold
     */
    public static List<Rectangle2D> rectangleBBoxWithGaps(List<Rectangle2D> rectangle2DList) {
        if (rectangle2DList.isEmpty()) {
            return Collections.emptyList();
        }
        double splitThreshold = rectangle2DList.stream()
                .mapToDouble(RectangularShape::getWidth).average()
                .orElse(5) * 5.0;
        List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
        List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
        rectangleListsWithGaps.add(rectangleListWithoutGaps);
        // The first rectangle has no predecessor and always opens the first group. Comparing it with itself
        // could exceed the threshold for a very wide rectangle and leave an empty leading group behind.
        Rectangle2D previousRectangle = null;
        for (Rectangle2D currentRectangle : rectangle2DList) {
            if (previousRectangle != null && Math.abs(currentRectangle.getMinX() - previousRectangle.getMaxX()) > splitThreshold) {
                rectangleListWithoutGaps = new LinkedList<>();
                rectangleListsWithGaps.add(rectangleListWithoutGaps);
            }
            rectangleListWithoutGaps.add(currentRectangle);
            previousRectangle = currentRectangle;
        }
        return rectangleListsWithGaps.stream()
                .map(RectangleTransformations::rectangle2DBBox)
                .toList();
    }

    /**
     * @return a collector that reduces a stream of rectangles to their common bounding box
     */
    public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
        return new Rectangle2DBBoxCollector();
    }

    /**
     * Collector that accumulates rectangles into the smallest rectangle enclosing all of them.
     */
    private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {

        @Override
        public Supplier<BBox> supplier() {
            return BBox::new;
        }

        @Override
        public BiConsumer<BBox, Rectangle2D> accumulator() {
            return BBox::addRectangle;
        }

        @Override
        public BinaryOperator<BBox> combiner() {
            return (b1, b2) -> {
                // a partial result that never saw a rectangle has null bounds and must not be unboxed
                if (b1.isEmpty()) {
                    return b2;
                }
                if (b2.isEmpty()) {
                    return b1;
                }
                return new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
                        Math.min(b1.lowerLeftY, b2.lowerLeftY),
                        Math.max(b1.upperRightX, b2.upperRightX),
                        Math.max(b1.upperRightY, b2.upperRightY));
            };
        }

        @Override
        public Function<BBox, Rectangle2D> finisher() {
            return BBox::toRectangle2D;
        }

        @Override
        public Set<Characteristics> characteristics() {
            return Set.of(Characteristics.UNORDERED);
        }

        /**
         * Mutable accumulator holding the current bounds; all bounds are null until the first rectangle is added.
         */
        @AllArgsConstructor
        @NoArgsConstructor
        private static class BBox {

            Double lowerLeftX;
            Double lowerLeftY;
            Double upperRightX;
            Double upperRightY;

            /**
             * @return true if at least one bound has not been set yet
             */
            private boolean isEmpty() {
                return lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null;
            }

            /**
             * @return the accumulated bounds as a rectangle, or an empty rectangle at the origin if nothing was added
             */
            public Rectangle2D toRectangle2D() {
                if (isEmpty()) {
                    return new Rectangle2D.Double(0, 0, 0, 0);
                }
                return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
            }

            /**
             * Grows the bounds so that they also enclose the given rectangle.
             *
             * @param rectangle2D the rectangle to include
             */
            public void addRectangle(Rectangle2D rectangle2D) {
                // normalise the corners so the comparison also works if min and max are swapped
                double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
                double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
                double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
                double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
                if (this.lowerLeftX == null || this.lowerLeftX > lowerLeftX) {
                    this.lowerLeftX = lowerLeftX;
                }
                if (this.lowerLeftY == null || this.lowerLeftY > lowerLeftY) {
                    this.lowerLeftY = lowerLeftY;
                }
                if (this.upperRightX == null || this.upperRightX < upperRightX) {
                    this.upperRightX = upperRightX;
                }
                if (this.upperRightY == null || this.upperRightY < upperRightY) {
                    this.upperRightY = upperRightY;
                }
            }
        }
    }
}

View File

@ -0,0 +1,250 @@
package com.knecon.fforesight.llm.service.document;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * Represents a range of text defined by a start and end index.
 * Provides functionality to check containment, intersection, and to adjust ranges based on specified conditions.
 * The start index is inclusive, the end index is exclusive.
 */
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class TextRange implements Comparable<TextRange> {

    private int start;
    private int end;


    /**
     * Constructs a TextRange with specified start and end indexes.
     *
     * @param start The starting index of the range.
     * @param end   The ending index of the range.
     * @throws IllegalArgumentException If start is greater than end.
     */
    public TextRange(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }


    /**
     * Returns the length of the text range.
     *
     * @return The length of the range.
     */
    public int length() {
        return end - start;
    }


    /**
     * @return The inclusive start index of the range.
     */
    public int start() {
        return start;
    }


    /**
     * @return The exclusive end index of the range.
     */
    public int end() {
        return end;
    }


    /**
     * Checks if this {@link TextRange} fully contains another TextRange.
     *
     * @param textRange The {@link TextRange} to check.
     * @return true if this range contains the specified range, false otherwise.
     */
    public boolean contains(TextRange textRange) {
        return start <= textRange.start() && textRange.end() <= end;
    }


    /**
     * Checks if this {@link TextRange} is fully contained by another TextRange.
     *
     * @param textRange The {@link TextRange} to check against.
     * @return true if this range is contained by the specified range, false otherwise.
     */
    public boolean containedBy(TextRange textRange) {
        return textRange.contains(this);
    }


    /**
     * Checks if this {@link TextRange} contains another range specified by start and end indices.
     *
     * @param start The starting index of the range to check.
     * @param end   The ending index of the range to check.
     * @return true if this range fully contains the specified range, false otherwise.
     * @throws IllegalArgumentException If the start index is greater than the end index.
     */
    public boolean contains(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return this.start <= start && end <= this.end;
    }


    /**
     * Checks if this {@link TextRange} is fully contained within another range specified by start and end indices.
     *
     * @param start The starting index of the outer range.
     * @param end   The ending index of the outer range.
     * @return true if this range is fully contained within the specified range, false otherwise.
     * @throws IllegalArgumentException If the start index is greater than the end index.
     */
    public boolean containedBy(int start, int end) {
        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return start <= this.start && this.end <= end;
    }


    /**
     * Determines if the specified index is within this {@link TextRange}.
     *
     * @param index The index to check.
     * @return true if the index is within the range (inclusive of the start and exclusive of the end), false otherwise.
     */
    public boolean contains(int index) {
        return start <= index && index < end;
    }


    /**
     * Checks if this {@link TextRange} intersects with another {@link TextRange}.
     *
     * @param textRange The {@link TextRange} to check for intersection.
     * @return true if the ranges intersect, false otherwise.
     */
    public boolean intersects(TextRange textRange) {
        return textRange.start() < this.end && this.start < textRange.end();
    }


    /**
     * Splits this TextRange into multiple ranges based on a list of indices.
     * A split index equal to the previous boundary is skipped, so no range of length 0 is produced by a split.
     *
     * @param splitIndices The indices at which to split the range, in ascending order.
     * @return A list of TextRanges resulting from the split.
     * @throws IndexOutOfBoundsException If any split index is outside this TextRange.
     * @throws IllegalArgumentException  If the split indices are not in ascending order.
     */
    public List<TextRange> split(List<Integer> splitIndices) {
        List<Integer> outOfRangeIndices = splitIndices.stream()
                .filter(idx -> !this.contains(idx))
                .toList();
        if (!outOfRangeIndices.isEmpty()) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", outOfRangeIndices, this));
        }
        List<TextRange> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {
            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new TextRange(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new TextRange(previousIndex, end));
        return splitBoundaries;
    }


    /**
     * Merges a collection of TextRanges into a single Text range encompassing all.
     *
     * @param boundaries The collection of TextRanges to merge.
     * @return A new TextRange covering the entire span of the given ranges.
     * @throws IllegalArgumentException If boundaries are empty.
     */
    public static TextRange merge(Collection<TextRange> boundaries) {
        int minStart = boundaries.stream()
                .mapToInt(TextRange::start)
                .min()
                .orElseThrow(IllegalArgumentException::new);
        int maxEnd = boundaries.stream()
                .mapToInt(TextRange::end)
                .max()
                .orElseThrow(IllegalArgumentException::new);
        return new TextRange(minStart, maxEnd);
    }


    @Override
    public String toString() {
        return format("Boundary [%d|%d)", start, end);
    }


    /**
     * Orders ranges by start index and, for equal starts, by end index.
     * This is a total order that is consistent with {@code equals}: the result is 0 only for ranges with the
     * same start and end. A range that both starts and ends before another one still sorts before it.
     *
     * @param textRange The range to compare with.
     * @return a negative value, zero or a positive value if this range sorts before, equal to or after the given range.
     */
    @Override
    public int compareTo(TextRange textRange) {
        int byStart = Integer.compare(start, textRange.start());
        return byStart != 0 ? byStart : Integer.compare(end, textRange.end());
    }


    /**
     * Shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without trailing or preceding whitespaces.
     *
     * @param textBlock TextBlock to check whitespaces against
     * @return Trimmed boundary
     */
    public TextRange trim(TextBlock textBlock) {
        if (this.length() == 0) {
            return this;
        }
        int trimmedStart = this.start;
        while (textBlock.containsIndex(trimmedStart) && trimmedStart < end && Character.isWhitespace(textBlock.charAt(trimmedStart))) {
            trimmedStart++;
        }
        int trimmedEnd = this.end;
        while (textBlock.containsIndex(trimmedEnd - 1) && trimmedStart < trimmedEnd && Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
            trimmedEnd--;
        }
        return new TextRange(trimmedStart, Math.max(trimmedEnd, trimmedStart));
    }

}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.llm.service.document.entity;
/**
 * Classification attached to an entity.
 * NOTE(review): the meaning of the individual constants is not defined in this file; confirm against the
 * code that assigns them before documenting them further.
 */
public enum EntityType {
ENTITY,
HINT,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION,
DICTIONARY_REMOVAL
}

View File

@ -0,0 +1,30 @@
package com.knecon.fforesight.llm.service.document.entity;
import com.knecon.fforesight.llm.service.document.TextRange;
/**
 * Minimal read-only view of an entity: its string value, the text range it covers and its type name.
 */
public interface IEntity {
/**
* Gets the value of this entity as a string.
*
* @return The string value.
*/
String getValue();
/**
* Gets the range of text in the document associated with this entity.
*
* @return The text range.
*/
TextRange getTextRange();
/**
* Gets the type of this entity.
*
* @return The entity type.
*/
String type();
}

View File

@ -0,0 +1,46 @@
package com.knecon.fforesight.llm.service.document.entity;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import lombok.experimental.UtilityClass;
/**
 * Builds entity ids by hashing the entity's type information, page numbers and rounded line rectangles.
 */
@UtilityClass
public final class IdBuilder {

    private final HashFunction hashFunction = Hashing.murmur3_128();


    /**
     * Builds an id from the page numbers of the given pages; see the overload taking page numbers.
     *
     * @param pages             the pages the entity appears on
     * @param rectanglesPerLine the entity's rectangles, one per line
     * @param type              the entity's type name
     * @param entityType        the entity type constant as a string
     * @return the hash as a string
     */
    public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
        List<Integer> pageNumbers = pages.stream()
                .map(Page::getNumber)
                .collect(Collectors.toList());
        return buildId(pageNumbers, rectanglesPerLine, type, entityType);
    }


    /**
     * Builds an id by concatenating type, entity type, the ascending page numbers and the rounded x, y, width
     * and height of every rectangle, and hashing the resulting string with murmur3 (128 bit).
     *
     * @param pageNumbers       the numbers of the pages the entity appears on, in any order
     * @param rectanglesPerLine the entity's rectangles, one per line; their order influences the id
     * @param type              the entity's type name
     * @param entityType        the entity type constant as a string
     * @return the hash as a string
     */
    public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
        StringBuilder hashInput = new StringBuilder();
        hashInput.append(type);
        hashInput.append(entityType);
        pageNumbers.stream()
                .sorted(Comparator.comparingInt(Integer::intValue))
                .forEach(hashInput::append);
        for (Rectangle2D lineRectangle : rectanglesPerLine) {
            hashInput.append(Math.round(lineRectangle.getX()));
            hashInput.append(Math.round(lineRectangle.getY()));
            hashInput.append(Math.round(lineRectangle.getWidth()));
            hashInput.append(Math.round(lineRectangle.getHeight()));
        }
        return hashFunction.hashString(hashInput.toString(), StandardCharsets.UTF_8).toString();
    }

}

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.llm.service.document.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * The position of an entity on one page: an id, the page and the entity's rectangles on that page, one per line.
 */
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PositionOnPage {
// Each entry in this list corresponds to an entry in the redaction log, this means:
// A single entity might be represented by multiple redaction log entries
// This is due to the RedactionLog only being able to handle a single page per entry.
// id of this position; the only field that cannot be changed after construction
final String id;
// the page the rectangles lie on
Page page;
// one rectangle per text line on that page
List<Rectangle2D> rectanglePerLine;
}

View File

@ -0,0 +1,248 @@
package com.knecon.fforesight.llm.service.document.entity;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class TextEntity implements IEntity {
// primary key
@EqualsAndHashCode.Include
final String id;
// primary key end
TextRange textRange;
@Builder.Default
List<TextRange> duplicateTextRanges = new ArrayList<>();
String type; // TODO: make final once ManualChangesApplicationService::recategorize is deleted
final EntityType entityType;
// inferred on graph insertion
String value;
String textBefore;
String textAfter;
@Builder.Default
Set<Page> pages = new HashSet<>();
List<PositionOnPage> positionsOnPagePerPage;
@Builder.Default
List<SemanticNode> intersectingNodes = new LinkedList<>();
SemanticNode deepestFullyContainingNode;
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, SemanticNode node) {
return TextEntity.builder().id(buildId(node, textRange, type, entityType)).type(type).entityType(entityType).textRange(textRange).build();
}
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id) {
return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build();
}
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id, String manualOverwriteSection) {
return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build();
}
private static String buildId(SemanticNode node, TextRange textRange, String type, EntityType entityType) {
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = node.getPositionsPerPage(textRange);
return IdBuilder.buildId(rectanglesPerLinePerPage.keySet(),
rectanglesPerLinePerPage.values()
.stream()
.flatMap(Collection::stream)
.toList(),
type,
entityType.name());
}
public void addTextRange(TextRange textRange) {
duplicateTextRanges.add(textRange);
}
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
return intersectingNodes.stream()
.anyMatch(clazz::isInstance);
}
public boolean occursInNode(SemanticNode semanticNode) {
return intersectingNodes.stream()
.anyMatch(node -> node.equals(semanticNode));
}
public boolean isType(String type) {
return type().equals(type);
}
public boolean isAnyType(List<String> types) {
return types.contains(type());
}
public void addIntersectingNode(SemanticNode containingNode) {
intersectingNodes.add(containingNode);
}
public String getValueWithLineBreaks() {
return getDeepestFullyContainingNode().getTextBlock().subSequenceWithLineBreaks(getTextRange());
}
public void removeFromGraph() {
intersectingNodes.forEach(node -> node.getEntities().remove(this));
pages.forEach(page -> page.getEntities().remove(this));
intersectingNodes = new LinkedList<>();
deepestFullyContainingNode = null;
pages = new HashSet<>();
}
public List<PositionOnPage> getPositionsOnPagePerPage() {
if (positionsOnPagePerPage == null || positionsOnPagePerPage.isEmpty()) {
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
Page firstPage = rectanglesPerLinePerPage.keySet()
.stream()
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
positionsOnPagePerPage = rectanglesPerLinePerPage.entrySet()
.stream()
.map(entry -> buildPositionOnPage(firstPage, id, entry))
.toList();
}
return positionsOnPagePerPage;
}
private static PositionOnPage buildPositionOnPage(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
if (entry.getKey().equals(firstPage)) {
return new PositionOnPage(id, entry.getKey(), entry.getValue());
} else {
return new PositionOnPage(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
}
}
public boolean containedBy(TextEntity textEntity) {
return textEntity.contains(this);
}
public boolean contains(TextEntity textEntity) {
if (this.textRange.contains(textEntity.getTextRange())) {
return true;
}
List<TextRange> textEntityDuplicateRanges = textEntity.getDuplicateTextRanges();
// use optimized indexed loops for extra performance boost
for (int i = 0, duplicateTextRangesSize = duplicateTextRanges.size(); i < duplicateTextRangesSize; i++) {
TextRange duplicateTextRange = duplicateTextRanges.get(i);
if (duplicateTextRange.contains(textEntity.getTextRange())) {
return true;
}
for (int j = 0, textEntityDuplicateRangesSize = textEntityDuplicateRanges.size(); j < textEntityDuplicateRangesSize; j++) {
TextRange otherRange = textEntityDuplicateRanges.get(j);
if (duplicateTextRange.contains(otherRange)) {
return true;
}
}
}
return false;
}
public boolean intersects(TextEntity textEntity) {
return this.textRange.intersects(textEntity.getTextRange()) //
|| duplicateTextRanges.stream()
.anyMatch(duplicateTextRange -> duplicateTextRange.intersects(textEntity.textRange)) //
|| duplicateTextRanges.stream()
.anyMatch(duplicateTextRange -> textEntity.getDuplicateTextRanges()
.stream()
.anyMatch(duplicateTextRange::intersects));
}
public boolean matchesAnnotationId(String manualRedactionId) {
return getPositionsOnPagePerPage().stream()
.anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("Entity[\"");
sb.append(value);
sb.append("\", ");
sb.append(textRange);
sb.append(", pages[");
pages.forEach(page -> {
sb.append(page.getNumber());
sb.append(", ");
});
sb.delete(sb.length() - 2, sb.length());
sb.append("], type = \"");
sb.append(type());
sb.append("\", EntityType.");
sb.append(entityType);
sb.append("]");
return sb.toString();
}
@Override
public String type() {
return getType();
}
}

View File

@ -0,0 +1,73 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Base class for semantic nodes. It holds the node's position in the document tree, its entities and
 * lazily computed caches for the text block and the bounding boxes. Equality is based on the tree id only.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public abstract class AbstractSemanticNode implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    @EqualsAndHashCode.Include
    List<Integer> treeId;

    TextBlock textBlock;
    DocumentTree documentTree;

    @Builder.Default
    Set<TextEntity> entities = new HashSet<>();

    Map<Page, Rectangle2D> bBoxCache;


    /**
     * Returns the node's text block, asking the interface default for it on first access and caching the result.
     *
     * @return the cached text block
     */
    @Override
    public TextBlock getTextBlock() {
        if (textBlock != null) {
            return textBlock;
        }
        textBlock = GenericSemanticNode.super.getTextBlock();
        return textBlock;
    }


    /**
     * @return tree id, node type and a summary of the text block, separated by ": "
     */
    @Override
    public String toString() {
        StringBuilder description = new StringBuilder(treeId.toString());
        description.append(": ").append(getType());
        description.append(": ").append(this.getTextBlock().buildSummary());
        return description.toString();
    }


    /**
     * Returns the node's bounding boxes per page, asking the interface default for them on first access and
     * caching the result.
     *
     * @return the cached bounding boxes
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {
        if (bBoxCache != null) {
            return bBoxCache;
        }
        bBoxCache = GenericSemanticNode.super.getBBox();
        return bBoxCache;
    }

}

View File

@ -0,0 +1,171 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
* Represents the entire document as a node within the document's semantic structure.
*/
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Document extends AbstractSemanticNode {
Set<Page> pages;
Integer numberOfPages;
@Builder.Default
static final SectionIdentifier sectionIdentifier = SectionIdentifier.document();
/**
 * @return always {@link NodeType#DOCUMENT}
 */
@Override
public NodeType getType() {
return NodeType.DOCUMENT;
}
/**
 * Collects every sub node of type SECTION into a list.
 *
 * @return A list of all sections within the document.
 */
public List<Section> getAllSections() {
    return streamAllSubNodesOfType(NodeType.SECTION).map(Section.class::cast)
            .collect(Collectors.toList());
}
/**
 * Collects the direct children of type SECTION into a list.
 *
 * @return A list of main sections within the document
 * @deprecated This method is marked for removal.
 * Use {@link #streamChildrenOfType(NodeType)} instead,
 * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
 */
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {
    return streamChildrenOfType(NodeType.SECTION).map(Section.class::cast)
            .collect(Collectors.toList());
}
/**
* Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
*
* @return A list of all children of type SECTION or SUPER_SECTION.
*/
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
.toList();
}
/**
* Streams all terminal (leaf) text blocks within the document in their natural order.
*
* @return A stream of terminal {@link TextBlock}.
*/
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock);
}
@Override
public List<Integer> getTreeId() {
return Collections.emptyList();
}
@Override
public void setTreeId(List<Integer> tocId) {
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
}
@Override
public SectionIdentifier getSectionIdentifier() {
return sectionIdentifier;
}
@Override
public Headline getHeadline() {
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
.findFirst()
.orElseGet(Headline::empty);
}
/**
* Streams all nodes within the document, regardless of type, in their natural order.
*
* @return A stream of all {@link SemanticNode} within the document.
*/
private Stream<SemanticNode> streamAllNodes() {
return getDocumentTree().allEntriesInOrder()
.map(DocumentTree.Entry::getNode);
}
/**
* Streams all image nodes contained within the document.
*
* @return A stream of {@link Image} nodes.
*/
public Stream<Image> streamAllImages() {
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
}
@Override
public String toString() {
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
}
@Override
public Map<Page, Rectangle2D> getBBox() {
Map<Page, Rectangle2D> bBox = new HashMap<>();
for (Page page : pages) {
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
}
return bBox;
}
}

View File

@ -0,0 +1,35 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * A {@link Paragraph} that carries a second text block, {@code unsortedLeafTextBlock},
 * in addition to the inherited {@code leafTextBlock}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {

    TextBlock unsortedLeafTextBlock;


    /**
     * Combines the inherited leaf text block and the unsorted leaf text block, in that order.
     *
     * @return the text block produced by {@link TextBlockCollector} from both blocks
     */
    @Override
    public TextBlock getTextBlock() {

        Stream<TextBlock> bothBlocks = Stream.of(leafTextBlock, unsortedLeafTextBlock);
        return bothBlocks.collect(new TextBlockCollector());
    }


    /**
     * Uses the string representation of {@link Paragraph}.
     * NOTE(review): presumably declared explicitly so that Lombok's {@code @Data} does not generate its own version.
     */
    @Override
    public String toString() {

        return super.toString();
    }

}

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents the footer of a document page. A footer is a leaf node whose section identifier is always empty.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Footer extends AbstractSemanticNode {

    static final SectionIdentifier sectionIdentifier = SectionIdentifier.empty();

    TextBlock leafTextBlock;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.FOOTER;
    }


    /**
     * @return the text block attached directly to this footer
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * @return the shared empty section identifier
     */
    @Override
    public SectionIdentifier getSectionIdentifier() {

        return sectionIdentifier;
    }


    @Override
    public String toString() {

        String summary = leafTextBlock.buildSummary();
        return getTreeId() + ": " + NodeType.FOOTER + ": " + summary;
    }

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.llm.service.document.nodes;
/**
 * Sub-interface of {@link SemanticNode} that declares no members of its own.
 */
public interface GenericSemanticNode extends SemanticNode {
}

View File

@ -0,0 +1,65 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents the header part of a document page. A header is a leaf node whose section identifier is always empty.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Header extends AbstractSemanticNode {

    static final SectionIdentifier sectionIdentifier = SectionIdentifier.empty();

    TextBlock leafTextBlock;


    @Override
    public NodeType getType() {

        return NodeType.HEADER;
    }


    @Override
    public boolean isLeaf() {

        return true;
    }


    /**
     * @return the text block attached directly to this header
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * @return the shared empty section identifier
     */
    @Override
    public SectionIdentifier getSectionIdentifier() {

        return sectionIdentifier;
    }


    @Override
    public String toString() {

        String summary = leafTextBlock.buildSummary();
        return getTreeId() + ": " + NodeType.HEADER + ": " + summary;
    }

}

View File

@ -0,0 +1,100 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents a headline in a document. A headline is a leaf node and acts as its own headline.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Headline extends AbstractSemanticNode {

    TextBlock leafTextBlock;
    SectionIdentifier sectionIdentifier;


    /**
     * Creates an empty headline with no text content.
     *
     * @return An empty {@link Headline} instance backed by an empty atomic text block on a fresh {@link Page}.
     */
    public static Headline empty() {

        return Headline.builder()
                .leafTextBlock(AtomicTextBlock.empty(-1L, 0, new Page(), -1, null))
                .build();
    }


    @Override
    public NodeType getType() {

        return NodeType.HEADLINE;
    }


    @Override
    public boolean isLeaf() {

        return true;
    }


    /**
     * @return the text block attached directly to this headline
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * @return this headline itself
     */
    @Override
    public Headline getHeadline() {

        return this;
    }


    /**
     * Returns the section identifier of this headline. When none has been set, it is derived from the
     * headline's search text on first access and stored for later calls.
     *
     * @return the section identifier of this headline
     */
    @Override
    public SectionIdentifier getSectionIdentifier() {

        if (sectionIdentifier != null) {
            return sectionIdentifier;
        }
        sectionIdentifier = SectionIdentifier.fromSearchText(getTextBlock().getSearchText());
        return sectionIdentifier;
    }


    /**
     * Checks whether the parent of this headline has any node of type PARAGRAPH among its sub-nodes.
     *
     * @return True if at least one such paragraph exists, false otherwise.
     */
    public boolean hasParagraphs() {

        SemanticNode parent = getParent();
        return parent.streamAllSubNodesOfType(NodeType.PARAGRAPH)
                .findFirst()
                .isPresent();
    }


    @Override
    public String toString() {

        String summary = leafTextBlock.buildSummary();
        return getTreeId() + ": " + NodeType.HEADLINE + ": " + summary;
    }

}

View File

@ -0,0 +1,140 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.entity.IEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents an image within the document. An image lives on exactly one {@link Page} and its
 * bounding box is its {@code position} on that page.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Image extends AbstractSemanticNode implements IEntity {

    String id;

    TextBlock leafTextBlock;

    ImageType imageType;
    boolean transparent;
    Rectangle2D position;

    Page page;


    @Override
    public NodeType getType() {

        return NodeType.IMAGE;
    }


    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * @return an immutable set containing only the page this image is located on
     */
    @Override
    public Set<Page> getPages() {

        return Collections.singleton(page);
    }


    @Override
    public TextRange getTextRange() {

        return leafTextBlock.getTextRange();
    }


    @Override
    public int length() {

        return getTextRange().length();
    }


    /**
     * @return the node type in lower case, i.e. {@code "image"}
     */
    @Override
    public String type() {

        return getType().toString().toLowerCase(Locale.ENGLISH);
    }


    @Override
    public String toString() {

        return getTreeId() + ": " + getValue() + " " + position;
    }


    /**
     * @return a new mutable map with a single entry mapping the image's page to its position
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        bBoxPerPage.put(page, position);
        return bBoxPerPage;
    }


    /**
     * @return the node type followed by the capitalized image type, e.g. {@code "Image:Logo"}
     */
    @Override
    public String getValue() {

        return NodeType.IMAGE + ":" + camelCase(imageType.toString());
    }


    /** Keeps the first character as is and lower-cases the rest. */
    private String camelCase(String name) {

        return name.charAt(0) + name.substring(1).toLowerCase(Locale.ENGLISH);
    }


    /**
     * Checks whether the share of this image's area that overlaps the given image exceeds the threshold.
     *
     * @param image                the image that might contain this image
     * @param containmentThreshold minimum ratio of overlapping area to this image's area (exclusive)
     * @return true if both images are on this image's page, overlap, and the ratio exceeds the threshold
     */
    public boolean mostlyContainedBy(Image image, double containmentThreshold) {

        Rectangle2D otherBox = image.getBBox().get(this.page);
        // the other image needs to be on the same page
        if (otherBox == null) {
            return false;
        }
        Rectangle2D ownBox = this.getBBox().get(this.page);
        Rectangle2D intersection = otherBox.createIntersection(ownBox);
        // createIntersection yields negative width/height for disjoint rectangles; multiplying two
        // negative extents would otherwise look like a positive overlap area.
        if (intersection.isEmpty()) {
            return false;
        }
        double intersectionArea = intersection.getWidth() * intersection.getHeight();
        double area = ownBox.getWidth() * ownBox.getHeight();
        return (intersectionArea / area) > containmentThreshold;
    }


    /**
     * Checks whether this image mostly contains the given image.
     * Returns false when the other image is on a different page or does not overlap this image.
     * <p>
     * NOTE(review): the ratio compared against the threshold is this image's area divided by the
     * overlapping area, which is never below 1 for overlapping images. This looks inverted
     * (overlap divided by the other image's area seems intended) but is kept unchanged; confirm with callers.
     *
     * @param image                the image that might be contained by this image
     * @param containmentThreshold threshold the ratio has to exceed
     * @return true if the images overlap on this image's page and the ratio exceeds the threshold
     */
    public boolean mostlyContains(Image image, double containmentThreshold) {

        Rectangle2D otherBox = image.getBBox().get(this.page);
        // previously a NullPointerException when the other image was on a different page
        if (otherBox == null) {
            return false;
        }
        Rectangle2D ownBox = this.getBBox().get(this.page);
        Rectangle2D intersection = ownBox.createIntersection(otherBox);
        // disjoint rectangles produce non-positive extents; a zero area previously led to a division
        // by zero (Infinity) and therefore to a positive answer
        if (intersection.isEmpty()) {
            return false;
        }
        double intersectionArea = intersection.getWidth() * intersection.getHeight();
        double area = ownBox.getWidth() * ownBox.getHeight();
        return (area / intersectionArea) > containmentThreshold;
    }

}

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.Locale;
/**
 * The kinds of images that can occur in a document.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR,
    GRAPHIC;


    /**
     * Maps a textual image type to its enum constant, ignoring case.
     *
     * @param imageType the textual type, may be {@code null}
     * @return the matching constant, or {@link #OTHER} for {@code null} and any unknown value
     */
    public static ImageType fromString(String imageType) {

        // null is treated like any other unknown value instead of raising a NullPointerException
        if (imageType == null) {
            return ImageType.OTHER;
        }
        return switch (imageType.toLowerCase(Locale.ROOT)) {
            case "logo" -> ImageType.LOGO;
            case "formula" -> ImageType.FORMULA;
            case "signature" -> ImageType.SIGNATURE;
            case "ocr" -> ImageType.OCR;
            case "graphic" -> ImageType.GRAPHIC;
            default -> ImageType.OTHER;
        };
    }
}

View File

@ -0,0 +1,22 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.Locale;
/**
 * The kinds of nodes that can occur in the document tree.
 */
public enum NodeType {
    DOCUMENT,
    SECTION,
    SUPER_SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;


    /**
     * Returns the constant name with its first character unchanged and all following characters in
     * lower case, e.g. {@code "Section"} or {@code "Table_cell"}.
     */
    @Override
    public String toString() {

        String constantName = name();
        char initial = constantName.charAt(0);
        String remainder = constantName.substring(1).toLowerCase(Locale.ENGLISH);
        return initial + remainder;
    }
}

View File

@ -0,0 +1,69 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * Represents a single page in a document. Two pages are considered equal when their page numbers are equal.
 */
@Getter
@Setter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Page {

    @EqualsAndHashCode.Include
    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    List<SemanticNode> mainBody;
    Header header;
    Footer footer;

    @Builder.Default
    Set<TextEntity> entities = new HashSet<>();

    @Builder.Default
    Set<Image> images = new HashSet<>();


    /**
     * Collects the text blocks of all leaf nodes in the main body into a single {@link TextBlock}.
     *
     * @return The main body text block.
     */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream()
                .filter(node -> node.isLeaf())
                .map(node -> node.getTextBlock())
                .collect(new TextBlockCollector());
    }


    /**
     * @return the page number as a string
     */
    @Override
    public String toString() {

        return String.valueOf(number);
    }

}

View File

@ -0,0 +1,54 @@
package com.knecon.fforesight.llm.service.document.nodes;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Represents a paragraph in the document. A paragraph is a leaf node holding its text directly.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PROTECTED)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Paragraph extends AbstractSemanticNode {

    TextBlock leafTextBlock;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.PARAGRAPH;
    }


    /**
     * @return the text block attached directly to this paragraph
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    @Override
    public String toString() {

        String summary = leafTextBlock.buildSummary();
        return getTreeId() + ": " + NodeType.PARAGRAPH + ": " + summary;
    }

}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.llm.service.document.nodes;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Represents a section within a document, encapsulating both its textual content and semantic structure.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class Section extends AbstractSemanticNode {

    @Override
    public NodeType getType() {

        return NodeType.SECTION;
    }


    /**
     * Returns the first direct child of type HEADLINE. When this section has no such child,
     * the headline of the parent node is returned instead.
     *
     * @return the headline associated with this section
     */
    @Override
    public Headline getHeadline() {

        return streamChildrenOfType(NodeType.HEADLINE)
                .map(Headline.class::cast)
                .findFirst()
                .orElseGet(() -> getParent().getHeadline());
    }


    /**
     * @return the section identifier of this section's headline
     */
    @Override
    public SectionIdentifier getSectionIdentifier() {

        Headline headline = getHeadline();
        return headline.getSectionIdentifier();
    }


    /**
     * Checks if this section contains any tables.
     *
     * @return True if at least one node of type TABLE exists among the sub-nodes, false otherwise.
     */
    public boolean hasTables() {

        return streamAllSubNodesOfType(NodeType.TABLE)
                .findAny()
                .isPresent();
    }


    /**
     * Checks if any headline among the sub-nodes of this section contains a given string.
     *
     * @param value The string to search for within headlines, case-sensitive.
     * @return True if at least one headline contains the specified string, false otherwise.
     */
    public boolean anyHeadlineContainsString(String value) {

        return streamAllSubNodesOfType(NodeType.HEADLINE)
                .anyMatch(headline -> headline.containsString(value));
    }


    /**
     * Checks if any headline among the sub-nodes of this section contains a given string, ignoring case.
     *
     * @param value The string to search for within headlines, case-insensitive.
     * @return True if at least one headline contains the specified string, false otherwise.
     */
    public boolean anyHeadlineContainsStringIgnoreCase(String value) {

        return streamAllSubNodesOfType(NodeType.HEADLINE)
                .anyMatch(headline -> headline.containsStringIgnoreCase(value));
    }


    @Override
    public String toString() {

        String summary = this.getTextBlock().buildSummary();
        return getTreeId() + ": " + NodeType.SECTION + ": " + summary;
    }

}

View File

@ -0,0 +1,158 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Represents a unique identifier for a section within a document.
 * An identifier is either empty, the document root, or a numerical identifier made of up to four
 * numbers parsed from the beginning of a headline (e.g. {@code "1.2.3"}).
 */
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {

    // optional leading whitespace, then up to four digit groups, each optionally separated by one of " . , ;"
    static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");

    private enum Format {
        EMPTY,
        NUMERICAL,
        DOCUMENT
    }

    Format format;
    String identifierString;
    List<Integer> identifiers;
    boolean asChild;


    /**
     * Generates a SectionIdentifier from the headline text of a section, determining its format and structure.
     *
     * @param headline The headline text from which to generate the section identifier, may be {@code null}.
     * @return A {@link SectionIdentifier} instance corresponding to the headline text; an empty identifier
     * when the headline is null, blank, or does not start with a usable number.
     */
    public static SectionIdentifier fromSearchText(String headline) {

        if (headline == null || headline.isBlank()) {
            return SectionIdentifier.empty();
        }

        Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline);
        if (numericalIdentifierMatcher.find()) {
            return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
        }
        // more formats here
        return SectionIdentifier.empty();
    }


    /**
     * Creates a copy of the given identifier that is marked as a child of it.
     *
     * @param sectionIdentifier The parent section identifier.
     * @return A new {@link SectionIdentifier} instance marked as a child.
     */
    public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {

        return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true);
    }


    /**
     * Generates a SectionIdentifier that represents the entire document.
     *
     * @return A {@link SectionIdentifier} with a document-wide scope.
     */
    public static SectionIdentifier document() {

        return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
    }


    /**
     * Generates an empty SectionIdentifier.
     *
     * @return An empty {@link SectionIdentifier} instance.
     */
    public static SectionIdentifier empty() {

        return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
    }


    /**
     * Builds a numerical identifier from the matched digit groups. Collection stops at the first
     * missing group or at a group that is exactly {@code "0"}.
     */
    private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) {

        String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end());
        List<Integer> identifiers = new LinkedList<>();
        for (int i = 1; i <= 4; i++) {
            // a group is either null (not present) or a non-empty run of digits
            String numericalIdentifier = numericalIdentifierMatcher.group(i);
            if (numericalIdentifier == null || numericalIdentifier.equals("0")) {
                break;
            }
            try {
                identifiers.add(Integer.parseInt(numericalIdentifier));
            } catch (NumberFormatException e) {
                // The digit run does not fit into an int (e.g. a long reference or phone number at the
                // start of the headline); such a headline is not treated as a numbered section.
                return SectionIdentifier.empty();
            }
        }
        return new SectionIdentifier(Format.NUMERICAL,
                                     identifierString,
                                     identifiers.stream()
                                             .toList(),
                                     false);
    }


    /**
     * Determines if the current section is the parent of the given section.
     * An empty identifier is never a parent, the document identifier is a parent of everything.
     * Otherwise both identifiers need the same format, this identifier must have fewer numbers than
     * the given one (or equally many if the given one is marked as child), and all of this
     * identifier's numbers must match the leading numbers of the given one.
     *
     * @param sectionIdentifier The section identifier to compare against.
     * @return true if the current section is the parent of the given section, false otherwise.
     */
    public boolean isParentOf(SectionIdentifier sectionIdentifier) {

        if (this.format.equals(Format.EMPTY)) {
            return false;
        }
        if (this.format.equals(Format.DOCUMENT)) {
            return true;
        }
        if (!this.format.equals(sectionIdentifier.format)) {
            return false;
        }

        int ownDepth = this.identifiers.size();
        int otherDepth = sectionIdentifier.identifiers.size();
        boolean sameDepthChild = ownDepth == otherDepth && sectionIdentifier.asChild;
        if (ownDepth >= otherDepth && !sameDepthChild) {
            return false;
        }

        for (int i = 0; i < ownDepth; i++) {
            if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) {
                return false;
            }
        }
        return true;
    }


    /**
     * Determines if the current section is a child of the given section, based on their identifiers.
     * Document and empty identifiers are never children.
     *
     * @param sectionIdentifier The section identifier to compare against.
     * @return True if the current section is a child of the given section, false otherwise.
     */
    public boolean isChildOf(SectionIdentifier sectionIdentifier) {

        if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) {
            return false;
        }
        return sectionIdentifier.isParentOf(this);
    }


    /**
     * @return the identifier string, i.e. the matched headline prefix, {@code "document"} or {@code "empty"}
     */
    @Override
    public String toString() {

        return identifierString;
    }

}

View File

@ -0,0 +1,672 @@
package com.knecon.fforesight.llm.service.document.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
public interface SemanticNode {
/**
 * Returns the type of this node, such as Section, Paragraph, etc.
 *
 * @return the {@link NodeType} of this node
 */
NodeType getType();
/**
 * Collects the text blocks of all leaf nodes located underneath this node in the DocumentTree and
 * concatenates them into a single TextBlock. For a Section this covers the text of all its
 * Subsections, Paragraphs, and Tables. Leaf nodes override this method and return their own
 * LeafTextBlock instead.
 *
 * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
 */
default TextBlock getTextBlock() {

    return streamAllSubNodes()
            .filter(node -> node.isLeaf())
            .map(node -> node.getTextBlock())
            .collect(new TextBlockCollector());
}
/**
 * Any Node maintains its own Set of Entities.
 * This Set contains all Entities whose TextRange intersects the TextRange of this node.
 *
 * @return Set of all {@link TextEntity} objects associated with this Node
 */
Set<TextEntity> getEntities();
/**
 * Returns the pages this node appears on, as reported by this node's TextBlock.
 * Each AtomicTextBlock is assigned a page, so the pages are gathered from the AtomicTextBlocks of the TextBlock.
 *
 * @return Set of Pages this node appears on.
 */
default Set<Page> getPages() {

    TextBlock textBlock = getTextBlock();
    return textBlock.getPages();
}
/**
 * Finds the page with the lowest page number among the pages of this node's TextBlock.
 *
 * @return the first Page this node appears on
 * @throws java.util.NoSuchElementException if the TextBlock of this node has no pages
 */
default Page getFirstPage() {

    Set<Page> pagesOfTextBlock = getTextBlock().getPages();
    return pagesOfTextBlock.stream()
            .min(Comparator.comparingInt(Page::getNumber))
            .orElseThrow();
}
/**
 * Returns the pages the given TextRange appears on, as reported by this node's TextBlock.
 *
 * @param textRange the range to look up, has to intersect the TextRange of this node
 * @return Set of Pages the TextRange appears on.
 * @throws IllegalArgumentException if the given range does not intersect the TextRange of this node
 */
default Set<Page> getPages(TextRange textRange) {

    if (getTextRange().intersects(textRange)) {
        return getTextBlock().getPages(textRange);
    }
    throw new IllegalArgumentException(format("%s which was used to query for pages is not intersected in the %s of this node!", textRange, getTextRange()));
}
/**
 * Checks whether this node appears on the page with the given number.
 *
 * @param pageNumber the page number to be checked
 * @return true if one of this node's pages has that number, otherwise false
 */
default boolean onPage(int pageNumber) {

    for (Page page : getPages()) {
        if (page.getNumber() == pageNumber) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the DocumentTree Object.
 *
 * @return the {@link DocumentTree} of the Document this node belongs to
 */
DocumentTree getDocumentTree();
/**
 * The id is a List of Integers uniquely identifying this node in the DocumentTree.
 *
 * @return the id of this node in the DocumentTree
 */
List<Integer> getTreeId();
/**
 * Sets the id of this node in the DocumentTree.
 * This should only be used during graph construction.
 *
 * @param tocId List of Integers
 */
void setTreeId(List<Integer> tocId);
/**
 * Returns the headline associated with this node. The default implementation asks the parent node,
 * so the lookup walks up the tree until a node answers with its own implementation: a Headline
 * returns itself, a Section returns its first Headline child, and the root searches the whole document.
 * If no Headline exists anywhere in the Document a dummy Headline is returned.
 *
 * @return First Headline found.
 */
default Headline getHeadline() {

    SemanticNode parent = getParent();
    return parent.getHeadline();
}
/**
 * Returns a SectionIdentifier that acts as a child of the identifier of the Headline associated with this SemanticNode.
 *
 * @return The SectionIdentifier of the associated Headline, marked as child.
 */
default SectionIdentifier getSectionIdentifier() {

    SectionIdentifier headlineIdentifier = getHeadline().getSectionIdentifier();
    return SectionIdentifier.asChildOf(headlineIdentifier);
}
/**
 * Asks the DocumentTree whether the node with this node's TreeId has a parent.
 *
 * @return boolean indicating whether this Node has a Parent in the DocumentTree
 */
default boolean hasParent() {

    DocumentTree tree = getDocumentTree();
    return tree.hasParentById(getTreeId());
}
/**
 * Looks up the parent entry of this node in the DocumentTree and returns its node.
 * NOTE(review): earlier documentation states that a NotFoundException is thrown when no parent is
 * present; that would be raised inside DocumentTree and should be confirmed there.
 *
 * @return The SemanticNode representing the Parent in the DocumentTree
 */
default SemanticNode getParent() {

    DocumentTree tree = getDocumentTree();
    return tree.getParentEntryById(getTreeId()).getNode();
}
/**
 * Returns the ancestor of this node that is located directly underneath the document.
 * If this node is such a top-level node or the document itself, the node itself is returned.
 *
 * @return The highest SemanticNode below the document under which this node is located.
 */
default SemanticNode getHighestParent() {

    DocumentTree tree = getDocumentTree();
    return tree.getHighestParentById(getTreeId());
}
/**
 * Returns the sibling that follows this SemanticNode in the document tree, if any.
 *
 * @return Optional containing the next sibling node, or empty if there is none
 */
default Optional<SemanticNode> getNextSibling() {

    DocumentTree tree = getDocumentTree();
    return tree.getNextSibling(getTreeId());
}
/**
 * Returns the sibling that precedes this SemanticNode in the document tree, if any.
 *
 * @return Optional containing the previous sibling node, or empty if there is none
 */
default Optional<SemanticNode> getPreviousSibling() {

    DocumentTree tree = getDocumentTree();
    return tree.getPreviousSibling(getTreeId());
}
/**
 * Tells whether this SemanticNode has direct access to a TextBlock. The default is false;
 * node types that hold their own text override this method.
 * Currently only Sections, Images, and Tables are not leaves.
 * A TableCell might be a leaf depending on its area compared to the page.
 *
 * @return boolean, indicating if a Node has direct access to a TextBlock
 */
default boolean isLeaf() {

    return false;
}
/**
 * Returns the TextBlock attached directly to this node.
 * The default implementation always throws; node types that hold their own text provide the real accessor.
 *
 * @return the LeafTextBlock of this node
 * @throws UnsupportedOperationException if this node type does not provide a LeafTextBlock
 */
default TextBlock getLeafTextBlock() {

    throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
}
/**
 * Sets the LeafTextBlock of this SemanticNode. Should only be used during construction of the Graph.
 * The default implementation always throws; node types that hold their own text provide the real setter.
 *
 * @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
 * @throws UnsupportedOperationException if this node type does not support a LeafTextBlock
 */
default void setLeafTextBlock(TextBlock textBlock) {

    throw new UnsupportedOperationException();
}
/**
 * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
 * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
 *
 * @return Integer representing the number on the page, or -1 if there is no AtomicTextBlock
 */
default Integer getNumberOnPage() {

    // getTextBlock() may assemble the block from all sub-nodes, so it is evaluated only once
    TextBlock textBlock = getTextBlock();
    if (textBlock.getAtomicTextBlocks().isEmpty()) {
        return -1;
    }
    return textBlock.getAtomicTextBlocks().get(0).getNumberOnPage();
}
/**
 * Checks if the SemanticNode contains any text.
 *
 * @return true, if this node's TextBlock is not empty
 */
default boolean hasText() {

    boolean withoutText = getTextBlock().isEmpty();
    return !withoutText;
}
/**
 * Checks whether the search text of this SemanticNode contains the provided String (case-sensitive).
 *
 * @param string A String which the TextBlock might contain
 * @return true, if this node's TextBlock contains the string
 */
default boolean containsString(String string) {

    String searchText = getTextBlock().getSearchText();
    return searchText.contains(string);
}
/**
 * Returns the set of layout engines recorded for this node.
 * {@code addEngine} adds to the returned set directly, so implementations have to return a mutable set.
 *
 * @return Set of {@link LayoutEngine} values of this node
 */
Set<LayoutEngine> getEngines();
/**
 * Adds the given layout engine to the set returned by {@code getEngines()}.
 *
 * @param engine the engine to record for this node
 */
default void addEngine(LayoutEngine engine) {

    Set<LayoutEngine> engines = getEngines();
    engines.add(engine);
}
/**
 * Checks whether this SemanticNode contains all the provided Strings.
 *
 * @param strings The Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains every one of the strings
 */
default boolean containsAllStrings(String... strings) {

    for (String candidate : strings) {
        if (!containsString(candidate)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether this SemanticNode contains at least one of the provided Strings.
 *
 * @param strings The Strings to check if they are contained in the TextBlock
 * @return true, if this node's TextBlock contains any of the provided strings
 */
default boolean containsAnyString(String... strings) {

    for (String candidate : strings) {
        if (containsString(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether this SemanticNode contains at least one of the Strings in the provided List.
 *
 * @param strings A List of Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains any of the strings
 */
default boolean containsAnyString(List<String> strings) {

    for (String candidate : strings) {
        if (containsString(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether this SemanticNode contains the provided String, ignoring case.
 * Both the search text and the provided String are lower-cased with {@link Locale#ROOT} before comparison.
 *
 * @param string A String which the TextBlock might contain
 * @return true, if this node's TextBlock contains the string case-insensitive
 */
default boolean containsStringIgnoreCase(String string) {

    String lowerCaseSearchText = getTextBlock().getSearchText().toLowerCase(Locale.ROOT);
    return lowerCaseSearchText.contains(string.toLowerCase(Locale.ROOT));
}
/**
 * Checks whether this SemanticNode contains at least one of the provided Strings, ignoring case.
 *
 * @param strings The Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains any of the strings
 */
default boolean containsAnyStringIgnoreCase(String... strings) {

    for (String candidate : strings) {
        if (containsStringIgnoreCase(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether this SemanticNode contains all the provided Strings, ignoring case.
 *
 * @param strings The Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains every one of the strings
 */
default boolean containsAllStringsIgnoreCase(String... strings) {

    for (String candidate : strings) {
        if (!containsStringIgnoreCase(candidate)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether one of the words of this SemanticNode's TextBlock equals the provided String exactly.
 *
 * @param word - String which the TextBlock might contain
 * @return true, if this node's TextBlock contains the word
 */
default boolean containsWord(String word) {

    for (String candidate : getTextBlock().getWords()) {
        if (candidate.equals(word)) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether one of the words of this SemanticNode's TextBlock equals the provided String, ignoring case.
 * Both sides are lower-cased with {@link Locale#ROOT} so that the result does not depend on the
 * default locale of the JVM.
 *
 * @param word - String which the TextBlock might contain
 * @return true, if this node's TextBlock contains the word
 */
default boolean containsWordIgnoreCase(String word) {

    // lower-case the query once instead of once per word of the text block
    String lowerCaseWord = word.toLowerCase(Locale.ROOT);
    return getTextBlock().getWords()
            .stream()
            .map(candidate -> candidate.toLowerCase(Locale.ROOT))
            .anyMatch(lowerCaseWord::equals);
}
/**
 * Checks whether this SemanticNode contains any of the provided Strings as a word.
 *
 * @param words - The Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains any of the provided strings as a word
 */
default boolean containsAnyWord(String... words) {

    // collect the words once; getTextBlock() may assemble the block from all sub-nodes
    Set<String> wordsOfNode = getTextBlock().getWords()
            .stream()
            .collect(Collectors.toSet());
    return Arrays.stream(words)
            .anyMatch(wordsOfNode::contains);
}
/**
 * Checks whether this SemanticNode contains any of the provided Strings as a word, ignoring case.
 * Both sides are lower-cased with {@link Locale#ROOT} so that the result does not depend on the
 * default locale of the JVM.
 *
 * @param words - The Strings which the TextBlock might contain
 * @return true, if this node's TextBlock contains any of the provided strings as a word
 */
default boolean containsAnyWordIgnoreCase(String... words) {

    // lower-case the words of the node once instead of once per query word
    Set<String> lowerCaseWordsOfNode = getTextBlock().getWords()
            .stream()
            .map(candidate -> candidate.toLowerCase(Locale.ROOT))
            .collect(Collectors.toSet());
    return Arrays.stream(words)
            .map(word -> word.toLowerCase(Locale.ROOT))
            .anyMatch(lowerCaseWordsOfNode::contains);
}
/**
 * Checks whether every one of the provided Strings appears as a word in this SemanticNode, case-sensitively.
 *
 * @param words the words to look for
 * @return true, if the word list of this node's TextBlock contains all the provided words (also true when none are given)
 */
default boolean containsAllWords(String... words) {
    for (String wanted : words) {
        boolean present = getTextBlock().getWords()
                .stream()
                .anyMatch(wanted::equals);
        if (!present) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether every one of the provided Strings appears as a word in this SemanticNode, ignoring case.
 * <p>
 * Case is folded with {@link Locale#ENGLISH}, the same fixed locale containsWordIgnoreCase applies to its search
 * word, so the result does not depend on the default locale of the JVM.
 *
 * @param words the words to look for
 * @return true, if the word list of this node's TextBlock contains all the provided words in any casing
 *         (also true when none are given)
 */
default boolean containsAllWordsIgnoreCase(String... words) {
    if (words.length == 0) {
        return true;
    }
    // Lower-case the words of the text once, instead of once per search word.
    Set<String> lowerCasedTextWords = getTextBlock().getWords()
            .stream()
            .map(textWord -> textWord.toLowerCase(Locale.ENGLISH))
            .collect(Collectors.toSet());
    return Arrays.stream(words)
            .map(wanted -> wanted.toLowerCase(Locale.ENGLISH))
            .allMatch(lowerCasedTextWords::contains);
}
/**
 * Tests whether a bounding box of this SemanticNode on the given page overlaps the provided rectangle.
 *
 * @param x          the lower left corner X value of the rectangle
 * @param y          the lower left corner Y value of the rectangle
 * @param w          width of the rectangle
 * @param h          height of the rectangle
 * @param pageNumber the number of the page the rectangle lies on
 * @return true if a bounding box of this node on that page intersects the rectangle, false otherwise
 */
default boolean intersectsRectangle(int x, int y, int w, int h, int pageNumber) {
    for (Map.Entry<Page, Rectangle2D> boxOnPage : getBBox().entrySet()) {
        // Only bounding boxes on the requested page are relevant.
        if (boxOnPage.getKey().getNumber() != pageNumber) {
            continue;
        }
        if (boxOnPage.getValue().intersects(x, y, w, h)) {
            return true;
        }
    }
    return false;
}
/**
 * Used while inserting a TextEntity into the graph. If the TextRange of this node's TextBlock intersects the
 * TextRange of the entity, this node is registered as an intersecting node of the entity and, when it contains the
 * entity's TextRange, also as its deepest fully containing node. The call is then repeated on all children
 * intersecting the entity's TextRange, so a containing child visited later replaces its parent as the deepest
 * fully containing node.
 *
 * @param textEntity the TextEntity which is being inserted into the graph
 */
default void addThisToEntityIfIntersects(TextEntity textEntity) {
    TextBlock ownText = getTextBlock();
    if (!ownText.getTextRange().intersects(textEntity.getTextRange())) {
        // Nothing to do for this node or anything below it.
        return;
    }
    if (ownText.containsTextRange(textEntity.getTextRange())) {
        textEntity.setDeepestFullyContainingNode(this);
    }
    textEntity.addIntersectingNode(this);
    getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange())
            .forEach(child -> child.addThisToEntityIfIntersects(textEntity));
}
/**
 * Streams the direct children of this node, as reported by the DocumentTree for this node's tree id.
 *
 * @return Stream of all direct children
 */
default Stream<SemanticNode> streamChildren() {
    DocumentTree tree = getDocumentTree();
    return tree.childNodes(getTreeId());
}
/**
 * Streams the direct children of this node which have the provided type, as reported by the DocumentTree.
 *
 * @param nodeType the type the children must have
 * @return Stream of all direct children of the provided type
 */
default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {
    DocumentTree tree = getDocumentTree();
    return tree.childNodesOfType(getTreeId(), nodeType);
}
/**
 * Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
 *
 * @return Stream of the nodes of all sub entries of this node
 */
default Stream<SemanticNode> streamAllSubNodes() {
    DocumentTree tree = getDocumentTree();
    return tree.allSubEntriesInOrder(getTreeId())
            .map(subEntry -> subEntry.getNode());
}
/**
 * Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in
 * order.
 *
 * @param nodeType the type the sub nodes must have
 * @return Stream of the nodes of all sub entries whose type equals the provided type
 */
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
    DocumentTree tree = getDocumentTree();
    return tree.allSubEntriesInOrder(getTreeId())
            .filter(subEntry -> subEntry.getType().equals(nodeType))
            .map(subEntry -> subEntry.getNode());
}
/**
 * The TextRange is the start and end string offsets in the reading order of the document.
 *
 * @return the TextRange of this node's TextBlock
 */
default TextRange getTextRange() {
    TextBlock ownText = getTextBlock();
    return ownText.getTextRange();
}
/**
 * Returns the length of this node's TextRange, i.e. of the text content in this node's TextBlock.
 *
 * @return the length of the text content
 */
default int length() {
    TextRange ownRange = getTextRange();
    return ownRange.length();
}
/**
 * For a given TextRange this function returns a List of rectangles around the text in the range, per page.
 * These rectangles are split either by a new line or by a large gap in the current line.
 * This is mainly used to find the positions of TextEntities.
 * <p>
 * A non-leaf node hands the request down to the first child containing the range. A leaf, or a node without such
 * a child, answers from its own TextBlock.
 *
 * @param textRange A TextRange to calculate the positions for.
 * @return A Map, where the keys are the pages and the values are a list of rectangles describing the position of words
 */
default Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange textRange) {
    if (!isLeaf()) {
        Optional<SemanticNode> containingChild = getDocumentTree().findFirstContainingChild(getTreeId(), textRange);
        if (containingChild.isPresent()) {
            return containingChild.get().getPositionsPerPage(textRange);
        }
    }
    return getTextBlock().getPositionsPerPage(textRange);
}
/**
 * Computes the bounding box of this node per page. A leaf derives it from its own text block, every other node
 * uses the union of the bounding boxes of its children.
 * If called on the Document, it will return the cropbox of each page.
 *
 * @return Rectangle2D fully encapsulating this Node for each page.
 */
default Map<Page, Rectangle2D> getBBox() {
    return isLeaf() ? getBBoxFromLeafTextBlock() : getBBoxFromChildren();
}
/**
 * Checks whether the bounding box of this SemanticNode contains the provided rectangle on the provided page.
 *
 * @param rectangle2D The rectangle to check if it is contained
 * @param pageNumber  The number of the page on which the rectangle should be checked
 * @return false if this node is not on that page, otherwise whether its bounding box on that page contains the rectangle
 */
default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {
    // A Page carrying only the number serves as lookup key for both the page set and the bounding box map.
    Page lookupKey = Page.builder().number(pageNumber).build();
    return getPages().contains(lookupKey) && getBBox().get(lookupKey).contains(rectangle2D);
}
/**
 * TODO: this produces unwanted results for sections spanning multiple columns.
 * Builds, for every page on which at least one child has a bounding box, the union of the children's bounding
 * boxes on that page. As the children compute their own boxes the same way, this is recursive.
 *
 * @return The union of the BoundingBoxes of all children, per page
 */
private Map<Page, Rectangle2D> getBBoxFromChildren() {
    List<Map<Page, Rectangle2D>> boxesOfChildren = streamChildren().map(child -> child.getBBox())
            .toList();
    Set<Page> coveredPages = boxesOfChildren.stream()
            .map(Map::keySet)
            .flatMap(Set::stream)
            .collect(Collectors.toSet());
    Map<Page, Rectangle2D> unionPerPage = new HashMap<>();
    coveredPages.forEach(page -> {
        Rectangle2D unionOnPage = boxesOfChildren.stream()
                .filter(boxes -> boxes.containsKey(page))
                .map(boxes -> boxes.get(page))
                .collect(RectangleTransformations.collectBBox());
        unionPerPage.put(page, unionOnPage);
    });
    return unionPerPage;
}
/**
 * Groups the AtomicTextBlocks of this node's TextBlock by page and computes one bounding box per page from them.
 *
 * @return The union of all BoundingBoxes of the TextBlock of this node, per page
 */
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
    Map<Page, List<AtomicTextBlock>> blocksByPage = getTextBlock().getAtomicTextBlocks()
            .stream()
            .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
    Map<Page, Rectangle2D> boxPerPage = new HashMap<>();
    for (Map.Entry<Page, List<AtomicTextBlock>> blocksOnPage : blocksByPage.entrySet()) {
        boxPerPage.put(blocksOnPage.getKey(), RectangleTransformations.atomicTextBlockBBox(blocksOnPage.getValue()));
    }
    return boxPerPage;
}
}

View File

@ -0,0 +1,89 @@
package com.knecon.fforesight.llm.service.document.nodes;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
/**
 * Semantic node of type {@link NodeType#SUPER_SECTION}. It offers convenience queries for the tables and headlines
 * found underneath it in the document tree.
 */
@Slf4j
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class SuperSection extends AbstractSemanticNode {

    @Override
    public NodeType getType() {

        return NodeType.SUPER_SECTION;
    }


    /**
     * Tells whether a node of type TABLE exists anywhere underneath this section.
     *
     * @return True if the section contains at least one table, false otherwise.
     */
    public boolean hasTables() {

        var anyTable = streamAllSubNodesOfType(NodeType.TABLE).findAny();
        return anyTable.isPresent();
    }


    @Override
    public SectionIdentifier getSectionIdentifier() {

        Headline headline = getHeadline();
        return headline.getSectionIdentifier();
    }


    @Override
    public String toString() {

        return new StringBuilder().append(getTreeId())
                .append(": ")
                .append(NodeType.SUPER_SECTION)
                .append(": ")
                .append(this.getTextBlock().buildSummary())
                .toString();
    }


    /**
     * Returns the first direct child of type HEADLINE. Without such a child the headline of the parent is returned.
     *
     * @return the headline of this section
     */
    public Headline getHeadline() {

        var ownHeadline = streamChildrenOfType(NodeType.HEADLINE).map(Headline.class::cast)
                .findFirst();
        if (ownHeadline.isPresent()) {
            return ownHeadline.get();
        }
        return getParent().getHeadline();
    }


    /**
     * Checks if any headline underneath this section contains a given string.
     *
     * @param value The string to search for within headlines, case-sensitive.
     * @return True if at least one headline contains the specified string, false otherwise.
     */
    public boolean anyHeadlineContainsString(String value) {

        return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(headline -> headline.containsString(value));
    }


    /**
     * Checks if any headline underneath this section contains a given string, case-insensitive.
     *
     * @param value The string to search for within headlines, case-insensitive.
     * @return True if at least one headline contains the specified string, false otherwise.
     */
    public boolean anyHeadlineContainsStringIgnoreCase(String value) {

        return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(headline -> headline.containsStringIgnoreCase(value));
    }

}

View File

@ -0,0 +1,306 @@
package com.knecon.fforesight.llm.service.document.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Represents a table within a document.
 * <p>
 * Cells are looked up among the children of this node's DocumentTree entry: the cell at (row, col) is the child at
 * index {@code row * numberOfCols + col}, with rows and columns counted from 0.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Table implements SemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    @EqualsAndHashCode.Include
    List<Integer> treeId;
    DocumentTree documentTree;

    int numberOfRows;
    int numberOfCols;

    // Filled lazily by getTextBlock() when not provided.
    TextBlock textBlock;

    @Builder.Default
    Set<TextEntity> entities = new HashSet<>();

    // Filled lazily by getBBox().
    Map<Page, Rectangle2D> bBoxCache;


    /**
     * Returns the bounding box per page as computed by {@link SemanticNode#getBBox()}. The result of the first call
     * is cached and returned by all later calls.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache == null) {
            bBoxCache = SemanticNode.super.getBBox();
        }
        return bBoxCache;
    }


    /**
     * Streams all entities in this table, that appear in a row, which contains all of the provided strings,
     * ignoring case.
     *
     * @param strings Strings to check whether a row contains them
     * @return Stream of all entities in this table, that appear in a row, which contains all of the provided strings
     */
    public Stream<TextEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {

        return IntStream.range(0, numberOfRows).boxed()
                .filter(row -> rowContainsStringsIgnoreCase(row, strings))
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Checks whether the specified row contains all the provided strings, ignoring case.
     * The text of the row and the provided strings are both lower-cased with {@link Locale#ROOT}, so the result does
     * not depend on the default locale of the JVM.
     *
     * @param row     the row to check as an Integer, must be smaller than numberOfRows
     * @param strings a list of strings to check for
     * @return true, if all strings appear in the provided row
     */
    public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {

        String rowText = streamRow(row).map(TableCell::getTextBlock)
                .collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);

        return strings.stream()
                .map(string -> string.toLowerCase(Locale.ROOT))
                .allMatch(rowText::contains);
    }


    /**
     * Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
     *
     * @param header the header value to search for
     * @param value  the string which the table cell should contain
     * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
     */
    public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {

        // Columns whose header cell contains the requested header text.
        List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
                .map(TableCell::getCol)
                .toList();

        // Keep every cell whose row has the value in one of those columns.
        return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
                        .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsString(value)))
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
     *
     * @param header the header value to search for
     * @param values the strings which the table cell should contain
     * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
     */
    public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {

        // Columns whose header cell contains the requested header text.
        List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
                .map(TableCell::getCol)
                .toList();

        // Keep every cell whose row has one of the values in one of those columns.
        return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
                        .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values)))
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Returns a TableCell at the provided row and column location.
     *
     * @param row int representing the row, must be at least 0 and smaller than numberOfRows
     * @param col int representing the col, must be at least 0 and smaller than numberOfCols
     * @return TableCell at the provided location in the table
     * @throws IllegalArgumentException if row or col lies outside the table
     */
    public TableCell getCell(int row, int col) {

        // Reject every index outside [0, numberOfRows) x [0, numberOfCols). An unchecked column of numberOfCols or a
        // negative column would otherwise be mapped onto a cell of a neighbouring row by the index computation below.
        if (row < 0 || row >= numberOfRows || col < 0 || col >= numberOfCols) {
            throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
        }
        int idx = row * numberOfCols + col;
        return (TableCell) documentTree.getEntryById(treeId).getChildren().get(idx).getNode();
    }


    /**
     * Streams all TableCells in this Table row-wise.
     *
     * @return Stream of all TableCells
     */
    public Stream<TableCell> streamTableCells() {

        return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node);
    }


    /**
     * Streams all non-header TableCells of every column whose header cell contains the provided header text.
     *
     * @param header text the search text of the header cell must contain
     * @return Stream of all TableCells which have the provided header
     */
    public Stream<TableCell> streamTableCellsWithHeader(String header) {

        return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header))
                .map(TableCell::getCol)
                .flatMap(this::streamCol)
                .filter(tableCellNode -> !tableCellNode.isHeader());
    }


    /**
     * Streams all TableCells belonging to the provided column from top down.
     *
     * @param col int representing the column
     * @return Stream of all TableCell in the provided column
     */
    public Stream<TableCell> streamCol(int col) {

        return IntStream.range(0, numberOfRows).boxed()
                .map(row -> getCell(row, col));
    }


    /**
     * Streams all TableCells belonging to the provided row from left to right.
     *
     * @param row int representing the row
     * @return Stream of all TableCell in the provided row
     */
    public Stream<TableCell> streamRow(int row) {

        return IntStream.range(0, numberOfCols).boxed()
                .map(col -> getCell(row, col));
    }


    /**
     * Streams all TableCells row-wise and filters them with header == true.
     *
     * @return Stream of all TableCells with header == true
     */
    public Stream<TableCell> streamHeaders() {

        return streamTableCells().filter(TableCell::isHeader);
    }


    /**
     * Streams all TableCells of the provided row and column and filters them with header == true.
     *
     * @param row int representing the row
     * @param col int representing the column
     * @return Stream of all TableCells with header == true in the provided row or col
     */
    public Stream<TableCell> streamHeadersForCell(int row, int col) {

        return Stream.concat(streamRow(row), streamCol(col))
                .filter(TableCell::isHeader);
    }


    /**
     * Streams all Headers and checks if the stripped search text of any of them equals the provided string.
     *
     * @param header string to check the headers for
     * @return true, if at least one header equals the provided string
     */
    public boolean hasHeader(String header) {

        return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header));
    }


    /**
     * Streams all Headers and checks if the stripped search text of any of them equals the provided string,
     * ignoring case. Both sides are lower-cased with {@link Locale#ENGLISH}.
     *
     * @param header string to check the headers for
     * @return true, if at least one header equals the provided string in any casing
     */
    public boolean hasHeaderIgnoreCase(String header) {

        return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock()
                .getSearchText()
                .strip()
                .toLowerCase(Locale.ENGLISH)
                .equals(header.toLowerCase(Locale.ENGLISH)));
    }


    /**
     * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
     *
     * @param header string to find header cells
     * @param value  string to check cells with provided header
     * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
     */
    public boolean hasRowWithHeaderAndValue(String header, String value) {

        return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value));
    }


    /**
     * Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
     *
     * @param header string to find header cells
     * @param values List of strings to check cells with provided header
     * @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
     */
    public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {

        return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
    }


    @Override
    public NodeType getType() {

        return NodeType.TABLE;
    }


    /**
     * Returns the TextBlock of this table. When none has been set, the one computed by
     * {@link SemanticNode#getTextBlock()} is stored on first access and reused afterwards.
     */
    @Override
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = SemanticNode.super.getTextBlock();
        }
        return textBlock;
    }


    @Override
    public String toString() {

        return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
    }

}

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.llm.service.document.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * A single cell of a table, identified by its row and column, optionally flagged as header cell.
 */
@Data
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
public class TableCell extends AbstractSemanticNode {

    int row;
    int col;
    boolean header;

    Rectangle2D bBox;

    // Returned by getTextBlock() while the cell is a leaf.
    TextBlock leafTextBlock;

    // Lazily collected from the leaves underneath the cell once it has children.
    TextBlock textBlock;


    /**
     * Maps every page of this cell to the one bounding box stored on the cell.
     *
     * @return a new map from each page of this cell to its bounding box
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> boxOnEveryPage = new HashMap<>();
        for (Page page : getPages()) {
            boxOnEveryPage.put(page, bBox);
        }
        return boxOnEveryPage;
    }


    @Override
    public NodeType getType() {

        return NodeType.TABLE_CELL;
    }


    /**
     * A cell is a leaf as long as its entry in the document tree has no children.
     */
    @Override
    public boolean isLeaf() {

        var ownEntry = getDocumentTree().getEntryById(getTreeId());
        return ownEntry.getChildren().isEmpty();
    }


    /**
     * Returns the leaf text block for a leaf cell. Otherwise the leaf text blocks of all leaves underneath this cell
     * are collected into one text block on first access and that result is reused afterwards.
     */
    @Override
    public TextBlock getTextBlock() {

        if (isLeaf()) {
            return leafTextBlock;
        }
        if (textBlock != null) {
            return textBlock;
        }
        textBlock = streamAllSubNodes().filter(subNode -> subNode.isLeaf())
                .map(leaf -> leaf.getLeafTextBlock())
                .collect(new TextBlockCollector());
        return textBlock;
    }


    @Override
    public String toString() {

        return new StringBuilder().append(getTreeId())
                .append(": ")
                .append(NodeType.TABLE_CELL)
                .append(": ")
                .append(this.getTextBlock().buildSummary())
                .toString();
    }

}

View File

@ -0,0 +1,257 @@
package com.knecon.fforesight.llm.service.document.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * A TextBlock consisting of exactly one piece of text on one page.
 * <p>
 * {@code textRange} holds the string offsets of the block, whereas {@code lineBreaks} and
 * {@code stringIdxToPositionIdx} are indexed relative to the start of the block: the methods of this class add or
 * subtract {@code textRange.start()} when translating between the two.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {

    Long id;
    Integer numberOnPage;
    Page page;

    //string coordinates
    TextRange textRange;
    String searchText;
    // Computed lazily from searchText by getWords() when not provided.
    List<String> words;
    List<Integer> lineBreaks;

    //position coordinates
    List<Integer> stringIdxToPositionIdx;
    @Getter
    List<Rectangle2D> positions;

    @EqualsAndHashCode.Exclude
    SemanticNode parent;


    /**
     * @return the number of stored line breaks plus one
     */
    @Override
    public int numberOfLines() {

        return lineBreaks.size() + 1;
    }


    /**
     * Creates a block without any text, line breaks or positions. Its TextRange starts and ends at the provided
     * string offset.
     */
    public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {

        return AtomicTextBlock.builder()
                .id(textBlockIdx)
                .textRange(new TextRange(stringOffset, stringOffset))
                .searchText("")
                .lineBreaks(Collections.emptyList())
                .page(page)
                .numberOnPage(numberOnPage)
                .stringIdxToPositionIdx(Collections.emptyList())
                .positions(Collections.emptyList())
                .parent(parent)
                .build();
    }


    /**
     * Creates a block from the text data and the position data of the same atomic text block.
     */
    public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData atomicTextBlockData, DocumentPositionData atomicPositionBlockData, SemanticNode parent, Page page) {

        return AtomicTextBlock.builder()
                .id(atomicTextBlockData.getId())
                .numberOnPage(atomicTextBlockData.getNumberOnPage())
                .page(page)
                .textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
                .searchText(atomicTextBlockData.getSearchText())
                .lineBreaks(Arrays.stream(atomicTextBlockData.getLineBreaks()).boxed()
                                    .toList())
                .stringIdxToPositionIdx(Arrays.stream(atomicPositionBlockData.getStringIdxToPositionIdx()).boxed()
                                                .toList())
                .positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
                .parent(parent)
                .build();
    }


    // Each inner array supplies the four constructor arguments of Rectangle2D.Float in order.
    private static List<Rectangle2D> toRectangle2DList(float[][] positions) {

        return Arrays.stream(positions)
                .map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
                .toList();
    }


    /**
     * Returns the TextRange of the line with the provided number, counted from 0.
     *
     * @param lineNumber the number of the line
     * @return the TextRange of the line, or an empty TextRange at the start of this block if there is no such line
     */
    public TextRange getLineTextRange(int lineNumber) {

        if (lineNumber >= numberOfLines() || lineNumber < 0) {
            return new TextRange(textRange.start(), textRange.start());
        }
        if (numberOfLines() == 1) {
            return textRange;
        }
        if (lineNumber == 0) {
            return new TextRange(textRange.start(), lineBreaks.get(0) + textRange.start());
        } else if (lineNumber == numberOfLines() - 1) {
            return new TextRange(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
        }
        return new TextRange(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
    }


    /**
     * Returns the words of this block. When no words were provided, the search text is split once at the boundaries
     * reported by an English word {@link BreakIterator} and the resulting segments are stored for later calls.
     *
     * @return the words of this block
     */
    @Override
    public List<String> getWords() {

        if (words == null) {
            words = new ArrayList<>();
            BreakIterator iterator = BreakIterator.getWordInstance(Locale.ENGLISH);
            iterator.setText(searchText);
            int start = iterator.first();
            for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
                words.add(searchText.substring(start, end));
            }
        }
        return words;
    }


    @Override
    public List<AtomicTextBlock> getAtomicTextBlocks() {

        return List.of(this);
    }


    /**
     * Returns the first stored line break behind the provided string index, or the end of the search text if there
     * is none, as a string offset (start of this block added).
     */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                .filter(linebreak -> linebreak > fromIndex - textRange.start()) //
                .findFirst() //
                .orElse(searchText.length()) + textRange.start();
    }


    /**
     * Returns the last stored line break at or before the provided string index, or the start of this block if
     * there is none, as a string offset (start of this block added).
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                .filter(linebreak -> linebreak <= fromIndex - textRange.start())//
                .reduce((a, b) -> b)//
                .orElse(0) + textRange.start();
    }


    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
    }


    /**
     * Returns the positions belonging to the provided TextRange.
     *
     * @param stringTextRange the TextRange to return the positions for, must be contained in this block
     * @return the positions in the range, empty for a range of length 0
     * @throws IndexOutOfBoundsException if this block does not contain the provided TextRange
     */
    @Override
    public List<Rectangle2D> getPositions(TextRange stringTextRange) {

        if (!containsTextRange(stringTextRange)) {
            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
        }
        if (stringTextRange.length() == 0) {
            return Collections.emptyList();
        }

        int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());

        // The end of the block has no entry to look up, so everything up to the last position is returned.
        if (stringTextRange.end() == this.textRange.end()) {
            return positions.subList(startPositionIdx, positions.size());
        }

        return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
    }


    /**
     * Splits the provided TextRange at the line breaks of this block lying inside it, turns the positions of each
     * part into rectangles via RectangleTransformations.rectangleBBoxWithGaps and returns all of them under the page
     * of this block.
     *
     * @param stringTextRange the TextRange to calculate the rectangles for
     * @return a map with the page of this block as only key
     */
    @Override
    public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {

        List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
                .stream()
                .map(this::getPositions)
                .map(RectangleTransformations::rectangleBBoxWithGaps)
                .flatMap(Collection::stream)
                .toList();
        Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
        rectanglePerLinePerPage.put(page, rectanglesPerLine);
        return rectanglePerLinePerPage;
    }


    /**
     * Returns the text in the provided TextRange with a newline at every line break of this block inside the range.
     *
     * @param textRange the TextRange to return the text for
     * @return the text with line breaks, or an empty String if the range is empty or not contained in this block
     */
    @Override
    public String subSequenceWithLineBreaks(TextRange textRange) {

        if (textRange.length() == 0 || !getTextRange().contains(textRange)) {
            return "";
        }

        // The parameter shadows the field of the same name. The stored line breaks are relative to the start of this
        // block, so they have to be shifted by the block's start, not by the start of the requested range.
        int blockStart = this.textRange.start();
        Set<Integer> lbInBoundary = lineBreaks.stream()
                .map(lineBreak -> lineBreak + blockStart)
                .filter(textRange::contains)
                .collect(Collectors.toSet());
        if (textRange.end() == getTextRange().end()) {
            lbInBoundary.add(getTextRange().end());
        }

        StringBuilder sb = new StringBuilder();
        for (int i = textRange.start(); i < textRange.end(); i++) {
            char character = this.charAt(i);
            if (lbInBoundary.contains(i + 1)) {
                // always plus one, due to the linebreaks being an exclusive end index
                if (!Character.isWhitespace(character)) {
                    // Keep the character and move the line break one character further.
                    lbInBoundary.remove(i + 1);
                    lbInBoundary.add(i + 2);
                    sb.append(character);
                    continue;
                }
                // The whitespace in front of the line break is replaced by the newline.
                sb.append("\n");
            } else {
                sb.append(character);
            }
        }

        return sb.toString();
    }


    // Line breaks of this block as string offsets, restricted to those inside the provided TextRange.
    private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {

        return getLineBreaks().stream()
                .map(linebreak -> linebreak + this.textRange.start())
                .filter(textRange::contains)
                .toList();
    }


    @Override
    public String toString() {

        return searchText;
    }

}

View File

@ -0,0 +1,268 @@
package com.knecon.fforesight.llm.service.document.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {
List<AtomicTextBlock> atomicTextBlocks;
String searchText;
TextRange textRange;
/**
 * Creates a ConcatenatedTextBlock without any AtomicTextBlocks.
 */
public static ConcatenatedTextBlock empty() {
    List<AtomicTextBlock> noBlocks = Collections.emptyList();
    return new ConcatenatedTextBlock(noBlocks);
}
public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
this.atomicTextBlocks = new LinkedList<>();
if (atomicTextBlocks.isEmpty()) {
textRange = new TextRange(-1, -1);
return;
}
var firstTextBlock = atomicTextBlocks.get(0);
this.atomicTextBlocks.add(firstTextBlock);
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
atomicTextBlocks.subList(1, atomicTextBlocks.size())
.forEach(this::concat);
}
/**
 * Appends the AtomicTextBlocks of the provided TextBlock to this block and extends the TextRange to its end. The
 * cached search text is dropped and rebuilt on the next access.
 *
 * @param textBlock the TextBlock to append, must start where this block ends unless this block is still empty
 * @return this block
 * @throws UnsupportedOperationException if the provided TextBlock does not start at the end of this block
 */
public ConcatenatedTextBlock concat(TextBlock textBlock) {
    TextRange appended = textBlock.getTextRange();
    if (this.atomicTextBlocks.isEmpty()) {
        // The first block defines where this block starts.
        textRange.setStart(appended.start());
        textRange.setEnd(appended.end());
    } else if (textRange.end() != appended.start()) {
        throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, appended));
    }
    this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
    textRange.setEnd(appended.end());
    this.searchText = null;
    return this;
}
// Finds an AtomicTextBlock whose TextRange contains the provided string index, throws IndexOutOfBoundsException if there is none.
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
    for (AtomicTextBlock candidate : atomicTextBlocks) {
        if (candidate.getTextRange().contains(stringIdx)) {
            return candidate;
        }
    }
    throw new IndexOutOfBoundsException();
}
// Collects, in order, all AtomicTextBlocks whose TextRange intersects the provided TextRange.
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
    List<AtomicTextBlock> hits = new LinkedList<>();
    for (AtomicTextBlock candidate : atomicTextBlocks) {
        TextRange candidateRange = candidate.getTextRange();
        if (candidateRange.start() > textRange.end()) {
            // early stop, following TextBlocks will never intersect
            break;
        }
        if (!candidateRange.intersects(textRange)) {
            continue;
        }
        hits.add(candidate);
    }
    return hits;
}
/**
 * Returns the search texts of all AtomicTextBlocks joined in order. The result is cached until the next concat.
 */
@Override
public String getSearchText() {
    if (searchText != null) {
        return searchText;
    }
    StringBuilder joined = new StringBuilder();
    for (AtomicTextBlock block : getAtomicTextBlocks()) {
        joined.append(block.getSearchText());
    }
    searchText = joined.toString();
    return searchText;
}
/**
 * Returns the words of all AtomicTextBlocks in order, as a new list on every call.
 */
@Override
public List<String> getWords() {
    return atomicTextBlocks.stream()
            .flatMap(block -> block.getWords().stream())
            .toList();
}
/**
 * Returns the sum of the number of lines of all AtomicTextBlocks.
 */
@Override
public int numberOfLines() {
    int total = 0;
    for (AtomicTextBlock block : atomicTextBlocks) {
        total += block.numberOfLines();
    }
    return total;
}
/**
 * Delegates to the AtomicTextBlock containing the provided string index.
 */
@Override
public int getNextLinebreak(int fromIndex) {
    AtomicTextBlock owner = getAtomicTextBlockByStringIndex(fromIndex);
    return owner.getNextLinebreak(fromIndex);
}
/**
 * Delegates to the AtomicTextBlock containing the provided string index.
 */
@Override
public int getPreviousLinebreak(int fromIndex) {
    AtomicTextBlock owner = getAtomicTextBlockByStringIndex(fromIndex);
    return owner.getPreviousLinebreak(fromIndex);
}
/**
 * Returns the line breaks of all AtomicTextBlocks, in order, exactly as each block stores them.
 * NOTE(review): AtomicTextBlock adds its own start offset to its line breaks wherever it uses them, so the values
 * returned here appear to be relative to the individual blocks rather than to this block - confirm against callers.
 */
@Override
public List<Integer> getLineBreaks() {
    return getAtomicTextBlocks().stream()
            .map(AtomicTextBlock::getLineBreaks)
            .flatMap(List::stream)
            .toList();
}
/**
 * Delegates to the AtomicTextBlock containing the provided string index.
 */
@Override
public Rectangle2D getPosition(int stringIdx) {
    AtomicTextBlock owner = getAtomicTextBlockByStringIndex(stringIdx);
    return owner.getPosition(stringIdx);
}
/**
 * Returns the TextRange of the line with the provided number, counting the lines of all AtomicTextBlocks in order.
 *
 * @param lineNumber the number of the line, counted from 0
 * @return the TextRange of the line, or an empty TextRange at the start of this block if no block holds that line
 */
public TextRange getLineTextRange(int lineNumber) {
    if (atomicTextBlocks.size() == 1) {
        return atomicTextBlocks.get(0).getLineTextRange(lineNumber);
    }
    // Walk through the blocks, skipping the lines of every block in front of the requested line.
    int remaining = lineNumber;
    for (AtomicTextBlock block : atomicTextBlocks) {
        int linesInBlock = block.numberOfLines();
        if (remaining < linesInBlock) {
            return block.getLineTextRange(remaining);
        }
        remaining -= linesInBlock;
    }
    return new TextRange(textRange.start(), textRange.start());
}
/**
 * Returns the positions belonging to the provided TextRange. If the range spans several AtomicTextBlocks, the first
 * block contributes from the start of the range to its end, the blocks in between contribute all their positions
 * and the last block contributes from its start to the end of the range.
 *
 * @param stringTextRange the TextRange to return the positions for
 * @return the positions in the range, empty if no AtomicTextBlock intersects it
 */
@Override
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
    List<AtomicTextBlock> hits = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
    if (hits.isEmpty()) {
        return Collections.emptyList();
    }
    if (hits.size() == 1) {
        return hits.get(0).getPositions(stringTextRange);
    }
    int lastIdx = hits.size() - 1;
    AtomicTextBlock head = hits.get(0);
    TextRange headPart = new TextRange(stringTextRange.start(), head.getTextRange().end());
    List<Rectangle2D> collected = new LinkedList<>(head.getPositions(headPart));
    hits.subList(1, lastIdx)
            .forEach(inner -> collected.addAll(inner.getPositions()));
    AtomicTextBlock tail = hits.get(lastIdx);
    TextRange tailPart = new TextRange(tail.getTextRange().start(), stringTextRange.end());
    collected.addAll(tail.getPositions(tailPart));
    return collected;
}
/**
 * Collects the positions per page for a text range that may span several atomic text blocks.
 * <p>
 * The first affected block is queried from the range start to its own end, blocks in between are
 * queried with their full text range, and the last affected block is queried from its own start to
 * the range end. The per-block results are merged page by page.
 *
 * @param stringTextRange the text range to resolve
 * @return the positions grouped by page; a new empty map if no atomic text block is affected
 */
@Override
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {

    List<AtomicTextBlock> affected = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
    int count = affected.size();
    if (count == 0) {
        return new HashMap<>();
    }
    if (count == 1) {
        return affected.get(0).getPositionsPerPage(stringTextRange);
    }

    int lastIdx = count - 1;
    AtomicTextBlock head = affected.get(0);
    TextRange headRange = new TextRange(stringTextRange.start(), head.getTextRange().end());
    Map<Page, List<Rectangle2D>> merged = head.getPositionsPerPage(headRange);

    // blocks strictly between the first and the last one are fully covered
    for (int idx = 1; idx < lastIdx; idx++) {
        AtomicTextBlock inner = affected.get(idx);
        merged = mergeEntityPositionsWithSamePageNode(merged, inner.getPositionsPerPage(inner.getTextRange()));
    }

    AtomicTextBlock tail = affected.get(lastIdx);
    TextRange tailRange = new TextRange(tail.getTextRange().start(), stringTextRange.end());
    return mergeEntityPositionsWithSamePageNode(merged, tail.getPositionsPerPage(tailRange));
}
/**
 * Builds the text of a range, including line breaks, across all affected atomic text blocks.
 * <p>
 * An empty string is returned for a zero-length range or a range that this block's text range does
 * not contain. Otherwise the first affected block contributes the part from the range start to its
 * own end, blocks in between contribute their whole text with line breaks, and the last affected
 * block contributes the part from its own start to the range end.
 *
 * @param textRange the text range to extract (shadows the field of the same name)
 * @return the extracted text with line breaks, or {@code ""} as described above
 */
@Override
public String subSequenceWithLineBreaks(TextRange textRange) {

    boolean emptyRange = textRange.length() == 0;
    if (emptyRange || !getTextRange().contains(textRange)) {
        return "";
    }

    List<AtomicTextBlock> affected = getAllAtomicTextBlocksPartiallyInStringBoundary(textRange);
    int lastIdx = affected.size() - 1;
    if (lastIdx == 0) {
        return affected.get(0).subSequenceWithLineBreaks(textRange);
    }

    AtomicTextBlock head = affected.get(0);
    StringBuilder joined = new StringBuilder();
    TextRange headRange = new TextRange(textRange.start(), head.getTextRange().end());
    joined.append(head.subSequenceWithLineBreaks(headRange));

    // blocks strictly between the first and the last one are fully covered
    for (int idx = 1; idx < lastIdx; idx++) {
        joined.append(affected.get(idx).searchTextWithLineBreaks());
    }

    AtomicTextBlock tail = affected.get(lastIdx);
    TextRange tailRange = new TextRange(tail.getTextRange().start(), textRange.end());
    joined.append(tail.subSequenceWithLineBreaks(tailRange));
    return joined.toString();
}
/**
 * Merges two page-to-rectangles maps into a new map without modifying either argument.
 * <p>
 * Pages present in only one map keep that map's list. For pages present in both, the rectangles of
 * {@code map1} are followed by those of {@code map2} in a newly created list.
 *
 * @param map1 the base map, copied into the result first
 * @param map2 the map whose entries are merged into the copy
 * @return a new map containing the merged entries
 */
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {

    Map<Page, List<Rectangle2D>> combined = new HashMap<>(map1);
    for (Map.Entry<Page, List<Rectangle2D>> entry : map2.entrySet()) {
        combined.merge(entry.getKey(),
                entry.getValue(),
                (existing, added) -> Stream.concat(existing.stream(), added.stream())
                        .toList());
    }
    return combined;
}
/**
 * Returns the search text of this block, i.e. the result of {@code getSearchText()}.
 */
@Override
public String toString() {
return getSearchText();
}
}

View File

@ -0,0 +1,176 @@
package com.knecon.fforesight.llm.service.document.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.nodes.Page;
/**
 * A block of text that exposes its search text, its text range and the positions of its characters.
 * <p>
 * All index-based default methods ({@link #charAt(int)}, {@link #subSequence(int, int)},
 * {@link #indexOf(String, int)}) work on offsets in the coordinate space of {@link #getTextRange()}:
 * the start of the text range is subtracted before the search text is accessed and added back to
 * results.
 */
public interface TextBlock extends CharSequence {

    String getSearchText();


    List<String> getWords();


    List<AtomicTextBlock> getAtomicTextBlocks();


    TextRange getTextRange();


    int getNextLinebreak(int fromIndex);


    int getPreviousLinebreak(int fromIndex);


    TextRange getLineTextRange(int lineNumber);


    List<Integer> getLineBreaks();


    Rectangle2D getPosition(int stringIdx);


    List<Rectangle2D> getPositions(TextRange stringTextRange);


    Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);


    String subSequenceWithLineBreaks(TextRange textRange);


    int numberOfLines();


    /** Returns the text covered by the text range of the given line. */
    default CharSequence getLine(int lineNumber) {

        TextRange lineRange = getLineTextRange(lineNumber);
        return subSequence(lineRange);
    }


    /** Returns the positions covered by the text range of the given line. */
    default List<Rectangle2D> getLinePositions(int lineNumber) {

        TextRange lineRange = getLineTextRange(lineNumber);
        return getPositions(lineRange);
    }


    /** Returns the bounding box computed from the positions of the given line. */
    default Rectangle2D getLineBBox(int lineNumber) {

        List<Rectangle2D> linePositions = getLinePositions(lineNumber);
        return RectangleTransformations.rectangle2DBBox(linePositions);
    }


    /** Returns the text of the whole text range including line breaks. */
    default String searchTextWithLineBreaks() {

        TextRange wholeRange = getTextRange();
        return subSequenceWithLineBreaks(wholeRange);
    }


    /** Searches for {@code searchTerm} starting at the beginning of this block's text range. */
    default int indexOf(String searchTerm) {

        int blockStart = getTextRange().start();
        return indexOf(searchTerm, blockStart);
    }


    /** Returns the pages of all atomic text blocks as an unmodifiable set. */
    default Set<Page> getPages() {

        return getAtomicTextBlocks().stream()
                .map(block -> block.getPage())
                .collect(Collectors.toUnmodifiableSet());
    }


    /** Returns the pages of those atomic text blocks whose text range intersects {@code textRange}. */
    default Set<Page> getPages(TextRange textRange) {

        return getAtomicTextBlocks().stream()
                .filter(block -> block.getTextRange().intersects(textRange))
                .map(block -> block.getPage())
                .collect(Collectors.toUnmodifiableSet());
    }


    /**
     * Searches the search text for {@code searchTerm}.
     *
     * @param searchTerm  the term to look for
     * @param startOffset offset in text-range coordinates at which the search starts
     * @return the offset of the first match in text-range coordinates, or {@code -1} if there is none
     */
    default int indexOf(String searchTerm, int startOffset) {

        int blockStart = getTextRange().start();
        int relativeIdx = getSearchText().indexOf(searchTerm, startOffset - blockStart);
        return relativeIdx == -1 ? -1 : relativeIdx + blockStart;
    }


    /** Returns the text from the start of the text range up to the next linebreak after that start. */
    default CharSequence getFirstLine() {

        int blockStart = getTextRange().start();
        return subSequence(blockStart, getNextLinebreak(blockStart));
    }


    /**
     * Checks whether this block's text range contains the given range.
     *
     * @throws IllegalArgumentException if the end of {@code textRange} lies before its start
     */
    default boolean containsTextRange(TextRange textRange) {

        if (textRange.start() > textRange.end()) {
            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
        }
        return getTextRange().contains(textRange);
    }


    /** Checks whether this block's text range contains the given index. */
    default boolean containsIndex(int stringIndex) {

        TextRange ownRange = getTextRange();
        return ownRange.contains(stringIndex);
    }


    /** Returns the text between the start and the end of {@code textRange}. */
    default CharSequence subSequence(TextRange textRange) {

        int from = textRange.start();
        int to = textRange.end();
        return subSequence(from, to);
    }


    /** Returns at most the first four space-separated words of the first 200 characters of the search text. */
    default String buildSummary() {

        String head = getSearchText();
        // only the head is split, as splitting very large strings gets expensive
        if (head.length() > 200) {
            head = head.substring(0, 200);
        }
        return Arrays.stream(head.split(" "))
                .limit(4)
                .collect(Collectors.joining(" "));
    }


    @Override
    default CharSequence subSequence(int start, int end) {

        int blockStart = getTextRange().start();
        return getSearchText().substring(start - blockStart, end - blockStart);
    }


    @Override
    default int length() {

        TextRange ownRange = getTextRange();
        return ownRange.length();
    }


    @Override
    default char charAt(int index) {

        int relativeIdx = index - getTextRange().start();
        return getSearchText().charAt(relativeIdx);
    }

}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.llm.service.document.textblock;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
/**
 * {@link Collector} that folds a stream of {@link TextBlock}s into one {@link ConcatenatedTextBlock}.
 * <p>
 * A fresh container is obtained from {@code ConcatenatedTextBlock.empty()}, every element is added
 * with {@code concat}, and partial containers are combined with {@code concat} as well. The container
 * itself is returned as the result.
 */
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {

        return ConcatenatedTextBlock::empty;
    }


    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {

        return ConcatenatedTextBlock::concat;
    }


    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {

        return ConcatenatedTextBlock::concat;
    }


    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {

        return a -> a;
    }


    /**
     * Only {@code IDENTITY_FINISH} is declared, matching the identity finisher.
     * <p>
     * {@code CONCURRENT} is deliberately not declared: it would permit the accumulator to be invoked
     * on one shared container from several threads, and concatenating text is order-sensitive.
     * NOTE(review): ConcatenatedTextBlock is not known to be thread-safe; re-add CONCURRENT only if
     * that is guaranteed.
     */
    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}

View File

@ -0,0 +1,264 @@
package com.knecon.fforesight.llm.service.services;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.azure.ai.openai.models.ChatChoice;
import com.azure.ai.openai.models.ChatCompletions;
import com.azure.ai.openai.models.ChatCompletionsOptions;
import com.azure.ai.openai.models.ChatRequestMessage;
import com.azure.ai.openai.models.ChatRequestSystemMessage;
import com.azure.ai.openai.models.ChatRequestUserMessage;
import com.azure.ai.openai.models.CompletionsUsage;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.llm.service.ChunkingResponse;
import com.knecon.fforesight.llm.service.ChunkingResponseData;
import com.knecon.fforesight.llm.service.LlmNerEntities;
import com.knecon.fforesight.llm.service.LlmNerEntity;
import com.knecon.fforesight.llm.service.LlmNerMessage;
import com.knecon.fforesight.llm.service.SystemMessages;
import com.knecon.fforesight.llm.service.document.ConsecutiveTextBlockCollector;
import com.knecon.fforesight.llm.service.document.DocumentData;
import com.knecon.fforesight.llm.service.document.DocumentGraphMapper;
import com.knecon.fforesight.llm.service.document.DocumentTree;
import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.utils.FormattingUtils;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LlmNerService {
public static final String JSON_PREFIX = "```json";
public static final String JSON_PREFIX2 = "```";
public static final String SUFFIX = "```";
StorageService storageService;
LlmRessource llmRessource;
ObjectMapper mapper;
@SneakyThrows
public Usage runNer(LlmNerMessage llmNerMessage) {
int completionTokenCount = 0;
int promptTokenCount = 0;
long start = System.currentTimeMillis();
Document document = buildDocument(llmNerMessage);
ChunkingResponse chunks = readChunks(llmNerMessage.getChunksStorageId());
List<LlmNerEntity> allEntities = Collections.synchronizedList(new LinkedList<>());
List<CompletableFuture<EntitiesWithUsage>> entityFutures = chunks.getData()
.stream()
.map(chunk -> getLlmNerEntitiesFuture(chunk, document))
.toList();
for (CompletableFuture<EntitiesWithUsage> entityFuture : entityFutures) {
EntitiesWithUsage entitiesWithUsage = entityFuture.get();
allEntities.addAll(entitiesWithUsage.entities());
completionTokenCount += entitiesWithUsage.completionsUsage().getCompletionTokens();
promptTokenCount += entitiesWithUsage.completionsUsage().getPromptTokens();
}
storageService.storeJSONObject(TenantContext.getTenantId(), llmNerMessage.getResultStorageId(), new LlmNerEntities(allEntities));
long duration = System.currentTimeMillis() - start;
log.info("Found {} named entities for {} in {} with {} prompt tokens and {} completion tokens.",
allEntities.size(),
llmNerMessage.getIdentifier(),
FormattingUtils.humanizeDuration(duration),
promptTokenCount,
completionTokenCount);
return new Usage(completionTokenCount, promptTokenCount, duration);
}
private CompletableFuture<EntitiesWithUsage> getLlmNerEntitiesFuture(ChunkingResponseData chunk, Document document) {
return CompletableFuture.supplyAsync(() -> getLlmNerEntities(chunk, document));
}
@SneakyThrows
private EntitiesWithUsage getLlmNerEntities(ChunkingResponseData chunk, Document document) {
log.debug("Sending request with text of length {}", chunk.getText().length());
long start = System.currentTimeMillis();
ChatCompletions chatCompletions = runNer(chunk.getText());
log.debug("Got response back, used {} prompt tokens, {} completion tokens, took {}",
chatCompletions.getUsage().getPromptTokens(),
chatCompletions.getUsage().getCompletionTokens(),
FormattingUtils.humanizeDuration(System.currentTimeMillis() - start));
return mapEntitiesToDocument(chatCompletions, getChunkParts(document, chunk.getTreeIds()), document);
}
public ChatCompletions runNer(String text) throws InterruptedException {
List<ChatRequestMessage> chatMessages = new ArrayList<>();
chatMessages.add(new ChatRequestSystemMessage(SystemMessages.NER));
chatMessages.add(new ChatRequestUserMessage(text));
ChatCompletionsOptions options = new ChatCompletionsOptions(chatMessages);
options.setTemperature(0.0);
return llmRessource.getChatCompletions(options);
}
private List<TextBlock> getChunkParts(Document document, List<List<Integer>> treeIds) {
return treeIds.stream()
.map(treeId -> document.getDocumentTree().getEntryById(treeId))
.map(DocumentTree.Entry::getNode)
.map(SemanticNode::getTextBlock)
.collect(new ConsecutiveTextBlockCollector());
}
private EntitiesWithUsage mapEntitiesToDocument(ChatCompletions chatCompletions, List<TextBlock> chunkParts, Document document) {
EntitiesWithUsage allEntities = new EntitiesWithUsage(new LinkedList<>(), chatCompletions.getUsage());
for (ChatChoice choice : chatCompletions.getChoices()) {
String response = parseResponse(choice);
if (response == null) {
continue;
}
try {
Map<String, List<String>> entitiesPerType = mapper.readValue(response, new TypeReference<Map<String, List<String>>>() {
});
List<LlmNerEntity> entitiesFromResponse = entitiesPerType.entrySet()
.stream()
.flatMap(entitiesWithType -> entitiesWithType.getValue()
.stream()
.distinct()
.flatMap(entity -> findInChunks(entity, chunkParts, entitiesWithType.getKey(), document)))
.toList();
allEntities.entities().addAll(entitiesFromResponse);
} catch (JsonProcessingException e) {
logMalformedResponse(response);
log.error(e.getMessage());
}
}
return allEntities;
}
private static String parseResponse(ChatChoice choice) {
String response = choice.getMessage().getContent();
if (response.startsWith(JSON_PREFIX)) {
response = response.substring(JSON_PREFIX.length());
} else if (response.startsWith(JSON_PREFIX2)) {
response = response.substring(JSON_PREFIX2.length());
} else {
logMalformedResponse(response);
return null;
}
if (response.endsWith(SUFFIX)) {
response = response.substring(0, response.length() - SUFFIX.length());
} else {
logMalformedResponse(response);
return null;
}
return response;
}
private Stream<LlmNerEntity> findInChunks(String entity, List<TextBlock> chunkParts, String type, Document document) {
Pattern entityPattern = Pattern.compile(String.format("(?:\\b|\\s)(%s)(?:\\b|\\s)", Pattern.quote(entity)));
for (TextBlock chunkPart : chunkParts) {
String searchText = chunkPart.getSearchText();
Matcher matcher = entityPattern.matcher(searchText);
List<LlmNerEntity> entitiesInCurrentChunk = matcher.results()
.map(matchResult -> new LlmNerEntity(entity,
type,
matchResult.start(1) + chunkPart.getTextRange().start(),
matchResult.end(1) + chunkPart.getTextRange().start()))
.toList();
if (!entitiesInCurrentChunk.stream()
.allMatch(nerEntity -> document.getTextBlock().subSequence(nerEntity.getStartOffset(), nerEntity.getEndOffset()).equals(nerEntity.getValue()))) {
log.error("Entities have wrong value, expected {}, actual {}",
entity,
entitiesInCurrentChunk.stream()
.map(LlmNerEntity::getValue)
.collect(Collectors.joining(", ")));
throw new AssertionError();
}
if (!entitiesInCurrentChunk.isEmpty()) {
if (entitiesInCurrentChunk.size() > 1) {
log.debug("Multiple entities found for {}, returning all occurrences", entity);
}
return entitiesInCurrentChunk.stream();
}
}
log.debug("Could not find entity {} in any of the chunks", entity);
return Stream.empty();
}
private static void logMalformedResponse(String response) {
log.error("Response could not be parsed as JSON, response is {}", response);
}
private ChunkingResponse readChunks(String chunksStorageId) {
return storageService.readJSONObject(TenantContext.getTenantId(), chunksStorageId, ChunkingResponse.class);
}
private Document buildDocument(LlmNerMessage llmNerMessage) {
DocumentData documentData = new DocumentData();
documentData.setDocumentStructure(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentStructureStorageId(), DocumentStructure.class));
documentData.setDocumentTextData(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentTextStorageId(), DocumentTextData[].class));
documentData.setDocumentPositionData(storageService.readJSONObject(TenantContext.getTenantId(),
llmNerMessage.getDocumentPositionStorageId(),
DocumentPositionData[].class));
documentData.setDocumentPages(storageService.readJSONObject(TenantContext.getTenantId(), llmNerMessage.getDocumentPagesStorageId(), DocumentPage[].class));
return DocumentGraphMapper.toDocumentGraph(documentData);
}
private record EntitiesWithUsage(List<LlmNerEntity> entities, CompletionsUsage completionsUsage) {
}
public record Usage(int completionTokenCount, int promptTokenCount, long durationMillis) {
}
}

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.llm.service.services;
import java.util.concurrent.Semaphore;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.azure.ai.openai.OpenAIAsyncClient;
import com.azure.ai.openai.OpenAIClient;
import com.azure.ai.openai.OpenAIClientBuilder;
import com.azure.ai.openai.models.ChatCompletions;
import com.azure.ai.openai.models.ChatCompletionsOptions;
import com.azure.core.credential.AzureKeyCredential;
import com.knecon.fforesight.llm.service.LlmServiceSettings;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import reactor.core.publisher.Flux;
/**
 * Access point to the Azure OpenAI clients.
 * <p>
 * Blocking chat completion requests are limited to {@code settings.getConcurrency()} simultaneous
 * calls by a semaphore; streaming requests are not limited.
 */
@Slf4j
@Service
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LlmRessource {

    OpenAIAsyncClient asyncClient;
    OpenAIClient client;
    LlmServiceSettings settings;
    Semaphore concurrencyLimitingSemaphore;


    public LlmRessource(@Value("${llm-service.azureOpenAiEndpoint}") String azureEndpoint, @Value("${llm-service.azureOpenAiKey}") String azureKey, LlmServiceSettings settings) {

        this.settings = settings;
        this.concurrencyLimitingSemaphore = new Semaphore(settings.getConcurrency());
        this.asyncClient = new OpenAIClientBuilder().credential(new AzureKeyCredential(azureKey)).endpoint(azureEndpoint).buildAsyncClient();
        this.client = new OpenAIClientBuilder().credential(new AzureKeyCredential(azureKey)).endpoint(azureEndpoint).buildClient();
    }


    /**
     * Requests chat completions as a stream for the configured model.
     * Note that this sets {@code stream = true} on the passed options.
     *
     * @param options the request options; modified by this call
     * @return the stream of chat completion updates
     */
    public Flux<ChatCompletions> getChatCompletionsFlux(ChatCompletionsOptions options) {

        options.setStream(true);
        return asyncClient.getChatCompletionsStream(settings.getModel(), options);
    }


    /**
     * Requests chat completions for the configured model, blocking until a permit is available.
     *
     * @param options the request options
     * @return the chat completions returned by the client
     * @throws InterruptedException if the thread is interrupted while waiting for a permit
     */
    public ChatCompletions getChatCompletions(ChatCompletionsOptions options) throws InterruptedException {

        concurrencyLimitingSemaphore.acquire();
        try {
            return client.getChatCompletions(settings.getModel(), options);
        } finally {
            // release even if the request fails, otherwise every failed call permanently costs a permit
            concurrencyLimitingSemaphore.release();
        }
    }


    /**
     * @return the number of permits currently taken, i.e. configured concurrency minus available permits
     */
    public int getCurrentConcurrency() {

        return settings.getConcurrency() - concurrencyLimitingSemaphore.availablePermits();
    }

}

View File

@ -0,0 +1,53 @@
package com.knecon.fforesight.llm.service.services;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.azure.ai.openai.models.ChatCompletions;
import com.azure.ai.openai.models.ChatCompletionsOptions;
import com.azure.ai.openai.models.ChatRequestMessage;
import com.azure.ai.openai.models.ChatRequestSystemMessage;
import com.azure.ai.openai.models.ChatRequestUserMessage;
import com.knecon.fforesight.llm.service.ChatEvent;
import com.knecon.fforesight.llm.service.SystemMessages;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import reactor.core.publisher.Flux;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LlmService {
WebSocketMessagingTemplate websocketTemplate;
LlmRessource llmRessource;
@SneakyThrows
public void rulesCopilot(List<String> prompt, String userId) {
List<ChatRequestMessage> chatMessages = new ArrayList<>();
chatMessages.add(new ChatRequestSystemMessage(SystemMessages.RULES_CO_PILOT));
chatMessages.addAll(prompt.stream()
.map(ChatRequestUserMessage::new)
.toList());
ChatCompletionsOptions options = new ChatCompletionsOptions(chatMessages);
Flux<ChatCompletions> chatCompletions = llmRessource.getChatCompletionsFlux(options);
chatCompletions.subscribe(chatCompletion -> sendRulesCopilotEvent(userId, chatCompletion.getChoices().get(0).getDelta().getContent()));
}
private void sendRulesCopilotEvent(String userId, String token) {
websocketTemplate.sendEvent(userId, "/queue/" + TenantContext.getTenantId() + "/rules-copilot", new ChatEvent(token));
}
}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.llm.service.services;
/**
 * Abstraction for sending events to a user.
 */
public interface WebSocketMessagingTemplate {
/**
 * Sends {@code payload} to the given user.
 *
 * @param userId  the user that receives the event
 * @param token   NOTE(review): callers visible in this change pass a destination path here
 *                (e.g. "/queue/.../rules-copilot"), not an access token - confirm the intended name
 * @param payload the event object to send
 */
void sendEvent(String userId, String token, Object payload);
}

View File

@ -0,0 +1,29 @@
package com.knecon.fforesight.llm.service.utils;
import lombok.experimental.UtilityClass;
/**
 * Formatting helpers.
 */
@UtilityClass
public class FormattingUtils {

    private static final long MILLIS_PER_SECOND = 1000L;
    private static final long MILLIS_PER_MINUTE = 60 * MILLIS_PER_SECOND;
    private static final long MILLIS_PER_HOUR = 60 * MILLIS_PER_MINUTE;


    /**
     * Formats a duration in milliseconds for humans, choosing the unit by magnitude:
     * <ul>
     * <li>below one second: {@code "<millis> ms"}</li>
     * <li>below one minute: seconds with one decimal, e.g. {@code "1.5 s"}</li>
     * <li>below one hour: minutes and zero-padded seconds, e.g. {@code "1:05.0 m"}</li>
     * <li>otherwise: hours, zero-padded minutes and seconds, e.g. {@code "1:01:05.0 h"}</li>
     * </ul>
     * Minutes and seconds after the first colon are zero-padded so that, for example, 1 min 5 s is
     * not rendered as {@code "1:5.0 m"}. Decimals are formatted with the default locale.
     *
     * @param duration the duration in milliseconds
     * @return the formatted duration
     */
    public String humanizeDuration(long duration) {

        if (duration < MILLIS_PER_SECOND) {
            return duration + " ms";
        }
        if (duration < MILLIS_PER_MINUTE) {
            double seconds = duration / 1000.0;
            return String.format("%.1f s", seconds);
        }
        if (duration < MILLIS_PER_HOUR) {
            long minutes = duration / MILLIS_PER_MINUTE;
            double seconds = (duration % MILLIS_PER_MINUTE) / 1000.0;
            return String.format("%d:%04.1f m", minutes, seconds);
        }

        long hours = duration / MILLIS_PER_HOUR;
        long remainingMillis = duration % MILLIS_PER_HOUR;
        long minutes = remainingMillis / MILLIS_PER_MINUTE;
        double seconds = (remainingMillis % MILLIS_PER_MINUTE) / 1000.0;
        return String.format("%d:%02d:%04.1f h", hours, minutes, seconds);
    }

}

View File

@ -0,0 +1,69 @@
import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
// Gradle plugins of this module: the application plugin, the project's shared Java conventions,
// Spring Boot with dependency management, SonarQube and Lombok.
plugins {
application
id("com.knecon.fforesight.service.java-conventions")
id("org.springframework.boot") version "3.2.3"
id("io.spring.dependency-management") version "1.1.3"
id("org.sonarqube") version "4.3.0.3225"
id("io.freefair.lombok") version "8.4"
}
// Removes Spring Boot's logging starter and commons-logging from every configuration;
// logback-classic is declared explicitly in the dependencies block below.
configurations {
all {
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
exclude(group = "commons-logging", module = "commons-logging")
}
}
// Versions used for the dependency declarations below.
// NOTE(review): springBootVersion (3.1.1) differs from the Spring Boot plugin version (3.2.3) declared above - confirm this is intended.
val springBootVersion = "3.1.1"
// NOTE(review): springCloudVersion is not referenced in the lines of this script visible here; openfeign is pinned to a literal version instead.
val springCloudVersion = "2022.0.5"
val springSecurityVersion = "6.1.3"
// NOTE(review): testcontainersVersion is not referenced in the lines of this script visible here.
val testcontainersVersion = "1.20.0"
// Module dependencies: the API and processor modules of this project, Spring Boot starters
// (actuator, AMQP, web, websocket), OpenFeign, Spring Security messaging and the shared commons libraries.
dependencies {
implementation(project(":llm-service-api"))
implementation(project(":llm-service-processor"))
implementation("org.springframework.boot:spring-boot-starter-actuator:$springBootVersion")
implementation("org.springframework.boot:spring-boot-starter-amqp:$springBootVersion")
implementation("org.springframework.boot:spring-boot-starter-web:$springBootVersion")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.3")
implementation("org.springframework.boot:spring-boot-starter-websocket:$springBootVersion")
implementation("org.springframework.security:spring-security-messaging:$springSecurityVersion")
implementation("com.iqser.red.commons:storage-commons:2.49.0")
implementation("com.knecon.fforesight:keycloak-commons:0.29.0")
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
// no version given here; presumably resolved through dependency management - verify
implementation("ch.qos.logback:logback-classic")
developmentOnly("org.springframework.boot:spring-boot-devtools:$springBootVersion")
annotationProcessor("org.springframework.boot:spring-boot-configuration-processor:$springBootVersion")
testImplementation("org.springframework.boot:spring-boot-starter-test:$springBootVersion")
// NOTE(review): spring-rabbit-test belongs to the org.springframework.amqp group but is versioned with springBootVersion - confirm that this version is the intended one.
testImplementation("org.springframework.amqp:spring-rabbit-test:$springBootVersion")
}
// Configures the container image build: appends -Dfile.encoding=UTF-8 to JAVA_TOOL_OPTIONS (with a
// space as delimiter), names the image after project name and version, and publishes to the
// registry using the mavenUser / mavenPassword / mavenEmail Gradle properties.
tasks.named<BootBuildImage>("bootBuildImage") {
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
// host networking is only enabled when the buildbootDockerHostNetwork property is passed (-PbuildbootDockerHostNetwork=...)
if (project.hasProperty("buildbootDockerHostNetwork")) {
network.set("host")
}
docker {
if (project.hasProperty("buildbootDockerHostNetwork")) {
bindHostToBuilder.set(true)
}
verboseLogging.set(true)
publishRegistry {
username.set(providers.gradleProperty("mavenUser").getOrNull())
password.set(providers.gradleProperty("mavenPassword").getOrNull())
email.set(providers.gradleProperty("mavenEmail").getOrNull())
url.set("https://nexus.knecon.com:5001/")
}
}
}

View File

@ -17,7 +17,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@EnableWebMvc
@EnableAsync
@Import({StorageAutoConfiguration.class})
@Import({StorageAutoConfiguration.class, LlmServiceConfiguration.class})
@ImportAutoConfiguration({StorageAutoConfiguration.class, MultiTenancyAutoConfiguration.class, SpringDocAutoConfiguration.class, DefaultKeyCloakCommonsAutoConfiguration.class})
@SpringBootApplication
public class Application {

View File

@ -5,7 +5,7 @@ import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.RestControllerAdvice;
import org.springframework.web.server.ResponseStatusException;
import com.knecon.fforesight.llm.service.api.ErrorMessage;
import com.knecon.fforesight.llm.service.ErrorMessage;
@RestControllerAdvice
public class ControllerAdvice {

View File

@ -6,7 +6,7 @@ import org.springframework.messaging.handler.annotation.MessageMapping;
import org.springframework.messaging.handler.annotation.Payload;
import org.springframework.stereotype.Controller;
import com.knecon.fforesight.llm.service.api.model.PromptList;
import com.knecon.fforesight.llm.service.PromptList;
import com.knecon.fforesight.llm.service.services.LlmService;
import lombok.RequiredArgsConstructor;

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.llm.service.queue;
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_QUEUE;
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_RESPONSE_QUEUE;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.core.Message;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.llm.service.LlmNerMessage;
import com.knecon.fforesight.llm.service.LlmNerResponseMessage;
import com.knecon.fforesight.llm.service.services.LlmNerService;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Consumes LLM NER requests from the request queue, runs the NER and publishes a response message
 * with the token usage and duration to the response queue.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class MessageHandler {

    LlmNerService llmNerService;
    ObjectMapper mapper;
    RabbitTemplate rabbitTemplate;


    /**
     * Handles one NER request message.
     *
     * @param message the raw AMQP message whose body is a JSON serialized {@link LlmNerMessage}
     * @throws AmqpRejectAndDontRequeueException if the message is flagged as redelivered, so that it
     *                                           is rejected without being requeued
     */
    @RabbitHandler
    @RabbitListener(queues = LLM_NER_SERVICE_QUEUE)
    public void receiveNerRequest(Message message) {

        if (message.getMessageProperties().isRedelivered()) {
            throw new AmqpRejectAndDontRequeueException("Redelivered LLM NER Request, aborting...");
        }

        LlmNerMessage llmNerMessage = parseLlmNerMessage(message);
        LlmNerService.Usage usage = llmNerService.runNer(llmNerMessage);

        // toIntExact throws ArithmeticException if the duration does not fit into an int
        LlmNerResponseMessage llmNerResponseMessage = new LlmNerResponseMessage(llmNerMessage.getIdentifier(),
                                                                                usage.completionTokenCount(),
                                                                                usage.promptTokenCount(),
                                                                                Math.toIntExact(usage.durationMillis()));
        rabbitTemplate.convertAndSend(LLM_NER_SERVICE_RESPONSE_QUEUE, llmNerResponseMessage);
    }


    /** Deserializes the message body; parse failures are rethrown via {@code @SneakyThrows}. */
    @SneakyThrows
    private LlmNerMessage parseLlmNerMessage(Message message) {

        return mapper.readValue(message.getBody(), LlmNerMessage.class);
    }

}

View File

@ -0,0 +1,41 @@
package com.knecon.fforesight.llm.service.queue;
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_DLQ;
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_QUEUE;
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_RESPONSE_QUEUE;
import org.springframework.amqp.core.Queue;
import org.springframework.amqp.core.QueueBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import lombok.RequiredArgsConstructor;
/**
 * Declares the queues of the LLM NER service: a request queue and a response queue that both
 * dead-letter to a shared dead letter queue, and that dead letter queue itself.
 */
@Configuration
@RequiredArgsConstructor
public class MessagingConfiguration {

    @Bean
    public Queue llmNerRequestQueue() {

        return durableQueueWithDeadLettering(LLM_NER_SERVICE_QUEUE);
    }


    @Bean
    public Queue llmNerResponseQueue() {

        return durableQueueWithDeadLettering(LLM_NER_SERVICE_RESPONSE_QUEUE);
    }


    @Bean
    public Queue llmNerResponseDLQ() {

        QueueBuilder deadLetterQueue = QueueBuilder.durable(LLM_NER_SERVICE_DLQ);
        return deadLetterQueue.build();
    }


    /** Builds a durable queue whose dead letters go to the NER dead letter queue via the exchange named "". */
    private static Queue durableQueueWithDeadLettering(String queueName) {

        return QueueBuilder.durable(queueName)
                .withArgument("x-dead-letter-exchange", "")
                .withArgument("x-dead-letter-routing-key", LLM_NER_SERVICE_DLQ)
                .build();
    }

}

View File

@ -0,0 +1,27 @@
package com.knecon.fforesight.llm.service.websocket;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.security.core.parameters.P;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.llm.service.services.WebSocketMessagingTemplate;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * {@link WebSocketMessagingTemplate} implementation that delegates to a {@link SimpMessagingTemplate}.
 */
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class WebSocketMessagingService implements WebSocketMessagingTemplate {
SimpMessagingTemplate messagingTemplate;
/**
 * Forwards the arguments unchanged to {@code convertAndSendToUser}; {@code token} is passed in the
 * position of the destination argument.
 */
@Override
public void sendEvent(String userId, String token, Object payload) {
messagingTemplate.convertAndSendToUser(userId, token, payload);
}
}

View File

@ -0,0 +1,65 @@
spring:
mvc:
pathmatch:
matching-strategy: ant-path-matcher
async:
request-timeout: 120s
rabbitmq:
host: ${RABBITMQ_HOST:localhost}
port: ${RABBITMQ_PORT:5672}
username: ${RABBITMQ_USERNAME:user}
password: ${RABBITMQ_PASSWORD:rabbitmq}
listener:
simple:
acknowledge-mode: AUTO
concurrency: 5
retry:
enabled: true
max-attempts: 3
max-interval: 15000
prefetch: 1
llm-service:
azureOpenAiKey: "Your key here"
azureOpenAiEndpoint: "https://knecon-ca-demo.openai.azure.com/"
fforesight:
llm-service:
base-path: '/api/llm'
keycloak:
ignored-endpoints: [ '/actuator/health', '/actuator/health/**', '/api/llm', '/api/llm/', '/internal/**', '/api/llm/docs/**', '/api/llm/docs', '/api/llm/llm-websocket' ]
enabled: true
springdoc:
base-path: '/api/llm'
auth-server-url: '/auth'
enabled: true
default-client-id: 'swagger-ui-client'
default-tenant: 'fforesight'
tenants:
remote: true
springdoc:
swagger-ui:
path: ${fforesight.springdoc.base-path}/docs/swagger-ui
operations-sorter: alpha
tags-sorter: alpha
oauth:
client-id: swagger-ui-client
doc-expansion: none
config-url: ${fforesight.springdoc.base-path}/docs/swagger-config
api-docs:
path: ${fforesight.springdoc.base-path}/docs?tenantId=${fforesight.springdoc.default-tenant}
enabled: ${fforesight.springdoc.enabled}
pre-loading-enabled: true
packages-to-scan: [ 'com.knecon.fforesight.llm.service.controller.external' ]
tenant-user-management-service:
url: "http://tenant-user-management-service:8080/internal"
text-analysis-service:
url: "http://embedding-service:8080"
keyword-service:
url: "http://keyword-extraction-service:8080"
cors.enabled: true

View File

@ -9,6 +9,7 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.autoconfigure.actuate.observability.AutoConfigureObservability;
import org.springframework.boot.test.context.SpringBootTest;
@ -26,6 +27,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.keycloakcommons.DefaultKeyCloakCommonsAutoConfiguration;
import com.knecon.fforesight.swaggercommons.SpringDocAutoConfiguration;
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
import com.knecon.fforesight.tenantcommons.TenantContext;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import com.knecon.fforesight.tenantcommons.model.TenantResponse;
@ -64,6 +68,7 @@ public abstract class AbstractLlmServiceIntegrationTest {
@SuppressWarnings("PMD.TestClassWithoutTestCases")
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
@ImportAutoConfiguration({StorageAutoConfiguration.class, MultiTenancyAutoConfiguration.class, SpringDocAutoConfiguration.class, DefaultKeyCloakCommonsAutoConfiguration.class})
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
public static class TestConfiguration {

View File

@ -0,0 +1,83 @@
package com.knecon.fforesight.llm.service;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.knecon.fforesight.llm.service.services.LlmNerService;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.SneakyThrows;
/**
 * Manual integration test that runs {@link LlmNerService#runNer} against document files read from a
 * local folder and downloads the result to {@code tmp/AAA_LLM_ENTITIES/entities.json}.
 *
 * <p>The test is {@link Disabled} and the input folder is an absolute path on a developer machine,
 * so it has to be adjusted before running it locally. The test performs no assertions; the
 * downloaded result is meant to be inspected by hand.
 */
@Disabled
public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {

    public static final String DOCUMENT_TEXT = "DOCUMENT_TEXT";
    public static final String DOCUMENT_POSITIONS = "DOCUMENT_POSITION";
    public static final String DOCUMENT_STRUCTURE = "DOCUMENT_STRUCTURE";
    public static final String DOCUMENT_PAGES = "DOCUMENT_PAGES";
    public static final String DOCUMENT_CHUNKS = "DOCUMENT_CHUNKS";

    @Autowired
    LlmNerService llmNerService;

    /** File-name fragments identifying the files that are uploaded to storage before the NER run. */
    Set<String> relevantFiles = Set.of(DOCUMENT_TEXT, DOCUMENT_POSITIONS, DOCUMENT_STRUCTURE, DOCUMENT_PAGES, DOCUMENT_CHUNKS);


    /**
     * Uploads the relevant files of the local folder, runs the NER and downloads the result file.
     */
    @Test
    @SneakyThrows
    public void testLlmNer() {

        Path folder = Path.of("/home/kschuettler/Downloads/New Folder (2)/2f4cc06f-d941-4f87-8928-b5d8a9476387/75ecec8c698f561c91d1a3e9f81dad7c");
        LlmNerMessage message = prepStorage(folder);

        llmNerService.runNer(message);

        Path tmpFile = Path.of("tmp", "AAA_LLM_ENTITIES", "entities.json");
        Files.createDirectories(tmpFile.getParent());
        // NOTE(review): the download uses TEST_TENANT while the upload uses TenantContext.getTenantId();
        // presumably both resolve to the same tenant in this test setup - confirm.
        storageService.downloadTo(TEST_TENANT, message.getResultStorageId(), tmpFile.toFile());
    }


    /**
     * Builds the request message for the folder and stores every regular file below it whose name
     * contains one of the {@link #relevantFiles} fragments.
     *
     * @param folder root folder that is walked recursively
     * @return the message referencing the storage ids derived from {@code folder}
     * @throws IOException if walking the folder fails
     */
    private LlmNerMessage prepStorage(Path folder) throws IOException {

        LlmNerMessage message = buildMessage(folder);
        // Files.walk keeps directory handles open until the stream is closed, so close it explicitly.
        try (var paths = Files.walk(folder)) {
            paths.filter(Files::isRegularFile)
                    .filter(path -> relevantFiles.stream()
                            .anyMatch(fragment -> path.getFileName().toString().contains(fragment)))
                    .forEach(relevantFile -> storeFile(relevantFile, folder));
        }
        return message;
    }


    /**
     * Stores a single file under the id {@code folder + <matching fragment>}, i.e. the same id scheme
     * that {@link #buildMessage(Path)} uses.
     *
     * @param relevantFile file to upload; its name must contain one of the {@link #relevantFiles} fragments
     * @param folder       folder whose string form prefixes the storage id
     */
    @SneakyThrows
    private void storeFile(Path relevantFile, Path folder) {

        String fileName = relevantFile.getFileName().toString();
        String storageId = folder + relevantFiles.stream()
                .filter(fileName::contains)
                .findFirst()
                .orElseThrow();

        try (var in = new FileInputStream(relevantFile.toFile())) {
            storageService.storeObject(TenantContext.getTenantId(), storageId, in);
        }
    }


    /**
     * Creates the NER request message; every storage id is the folder's string form directly followed
     * by the respective fragment (no separator is inserted).
     *
     * @param folder folder whose string form prefixes all storage ids
     * @return the populated message
     */
    private static LlmNerMessage buildMessage(Path folder) {

        return LlmNerMessage.builder()
                .chunksStorageId(folder + DOCUMENT_CHUNKS)
                .documentPagesStorageId(folder + DOCUMENT_PAGES)
                .documentTextStorageId(folder + DOCUMENT_TEXT)
                .documentPositionStorageId(folder + DOCUMENT_POSITIONS)
                .documentStructureStorageId(folder + DOCUMENT_STRUCTURE)
                .resultStorageId(folder + "result")
                .build();
    }

}

View File

@ -0,0 +1,17 @@
server:
port: 28080
fforesight:
keycloak:
enabled: true
springdoc:
enabled: false
tenant-user-management-service.url: "http://mock.url"
text-analysis-service.url: "http://mock.url"
epam-poc-service.url: "http://mock.url"
keyword-service.url: "http://mock.url"
llm-service:
azureOpenAiKey: "Your key here"
azureOpenAiEndpoint: "https://knecon-ca-demo.openai.azure.com/"

View File

@ -0,0 +1 @@
hub.image.name.prefix=docker-dev.knecon.com/tests/

View File

@ -0,0 +1 @@
{"entities":[{"value":"Kalt R.","type":"PII","startOffset":1951,"endOffset":1958},{"value":"Kalt R.","type":"PII","startOffset":3338,"endOffset":3345},{"value":"Kalt R.","type":"PII","startOffset":3476,"endOffset":3483},{"value":"Kalt R.","type":"PII","startOffset":3821,"endOffset":3828},{"value":"Jackson W.A.","type":"PII","startOffset":2286,"endOffset":2298},{"value":"Jackson W.A.","type":"PII","startOffset":2790,"endOffset":2802},{"value":"Jackson W.A.","type":"PII","startOffset":2911,"endOffset":2923},{"value":"Jackson W.A.","type":"PII","startOffset":3096,"endOffset":3108},{"value":"Kalt R.","type":"PII","startOffset":5055,"endOffset":5062},{"value":"Kalt R.","type":"PII","startOffset":5233,"endOffset":5240},{"value":"Kalt R.","type":"PII","startOffset":5895,"endOffset":5902},{"value":"Kalt R.","type":"PII","startOffset":5909,"endOffset":5916},{"value":"Kalt R.","type":"PII","startOffset":5931,"endOffset":5938},{"value":"Kalt R.","type":"PII","startOffset":5960,"endOffset":5967},{"value":"Kalt R.","type":"PII","startOffset":5989,"endOffset":5996},{"value":"Kalt R.","type":"PII","startOffset":6018,"endOffset":6025},{"value":"Kalt R.","type":"PII","startOffset":7253,"endOffset":7260},{"value":"Kalt R.","type":"PII","startOffset":7281,"endOffset":7288},{"value":"Kalt R.","type":"PII","startOffset":7309,"endOffset":7316},{"value":"Kalt R.","type":"PII","startOffset":7337,"endOffset":7344},{"value":"Kalt R. 
2009c","type":"PII","startOffset":10056,"endOffset":10069},{"value":"Kalt R.","type":"PII","startOffset":10767,"endOffset":10774},{"value":"Kalt R.","type":"PII","startOffset":10780,"endOffset":10787},{"value":"Kalt R.","type":"PII","startOffset":10802,"endOffset":10809},{"value":"Kalt R.","type":"PII","startOffset":10830,"endOffset":10837},{"value":"Kalt R.","type":"PII","startOffset":10858,"endOffset":10865},{"value":"Kalt R.","type":"PII","startOffset":10886,"endOffset":10893},{"value":"Kalt R.","type":"PII","startOffset":11980,"endOffset":11987},{"value":"Kalt R.","type":"PII","startOffset":12008,"endOffset":12015},{"value":"Kalt R.","type":"PII","startOffset":12036,"endOffset":12043},{"value":"Kalt R.","type":"PII","startOffset":12064,"endOffset":12071},{"value":"Kalt R.","type":"PII","startOffset":13814,"endOffset":13821},{"value":"Kalt R.","type":"PII","startOffset":14598,"endOffset":14605},{"value":"Kalt R.","type":"PII","startOffset":14855,"endOffset":14862},{"value":"Kalt R.","type":"PII","startOffset":15149,"endOffset":15156},{"value":"Kalt R.","type":"PII","startOffset":15481,"endOffset":15488},{"value":"Kalt R. 
2009c","type":"PII","startOffset":16392,"endOffset":16405},{"value":"Kalt R.","type":"PII","startOffset":17850,"endOffset":17857},{"value":"Kalt R.","type":"PII","startOffset":18284,"endOffset":18291},{"value":"Kalt R.","type":"PII","startOffset":18932,"endOffset":18939},{"value":"Kalt R.","type":"PII","startOffset":19412,"endOffset":19419},{"value":"Kalt R.","type":"PII","startOffset":19660,"endOffset":19667},{"value":"Kalt R.","type":"PII","startOffset":19973,"endOffset":19980},{"value":"Kalt R.","type":"PII","startOffset":20246,"endOffset":20253},{"value":"Kalt R.","type":"PII","startOffset":20522,"endOffset":20529},{"value":"Jackson W.","type":"PII","startOffset":19197,"endOffset":19207},{"value":"Briswalter C.","type":"PII","startOffset":20778,"endOffset":20791},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19003,"endOffset":19052},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19529,"endOffset":19578},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":19776,"endOffset":19825},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20046,"endOffset":20095},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20362,"endOffset":20411},{"value":"Syngenta Crop Protection, Münchwilen, Switzerland","type":"ADDRESS","startOffset":20638,"endOffset":20687},{"value":"Syngenta Technology & Projects, Huddersfield, United Kingdom","type":"ADDRESS","startOffset":19265,"endOffset":19325},{"value":"Syngenta Crop Protection AG, Basel, Switzerland","type":"ADDRESS","startOffset":20809,"endOffset":20856}]}

View File

@ -1,8 +1,45 @@
#!/bin/bash
set -e
dir=${PWD##*/}
gradle assemble
buildNumber=${1:-1}
# Get the current Git branch
branch=$(git rev-parse --abbrev-ref HEAD)
gradle bootBuildImage --cleanCache --publishImage -Pversion=$USER-$buildNumber
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber"
# Get the short commit hash (first 5 characters)
commit_hash=$(git rev-parse --short=5 HEAD)
# Combine branch and commit hash
buildName="${USER}-${branch}-${commit_hash}"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
newImageName="nexus.knecon.com:5001/ff/llm-service-server:${buildName}"
echo "full image name:"
echo ${newImageName}
echo ""
# Deployment is optional: without a namespace argument the script stops after building the image.
if [ -z "${1:-}" ]; then
  exit 0
fi

namespace="${1}"
deployment_name="llm-service"

echo "deploying to ${namespace}"

# NOTE(review): [*] yields all container images of the deployment separated by spaces; the
# comparison below therefore only matches for single-container deployments - confirm.
oldImageName=$(rancher kubectl -n "${namespace}" get deployment "${deployment_name}" -o=jsonpath='{.spec.template.spec.containers[*].image}')

if [ "${newImageName}" = "${oldImageName}" ]; then
  # Same tag as already deployed: force a restart so the freshly pushed image is pulled.
  echo "Image tag did not change, redeploying..."
  rancher kubectl rollout restart deployment "${deployment_name}" -n "${namespace}"
else
  echo "upgrading the image tag..."
  rancher kubectl set image "deployment/${deployment_name}" "${deployment_name}=${newImageName}" -n "${namespace}"
fi

# Block until the rollout has finished (or failed, which aborts the script via `set -e`).
rancher kubectl rollout status deployment "${deployment_name}" -n "${namespace}"
echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"

View File

@ -1 +1,7 @@
rootProject.name = "llm-service"
rootProject.name = "llm-service"
include(":llm-service-api")
include(":llm-service-server")
include(":llm-service-processor")
project(":llm-service-api").projectDir = file("llm-service/llm-service-api")
project(":llm-service-server").projectDir = file("llm-service/llm-service-server")
project(":llm-service-processor").projectDir = file("llm-service/llm-service-processor")

View File

@ -1,21 +0,0 @@
package com.knecon.fforesight.llm.service.queue;
import org.springframework.stereotype.Service;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Placeholder Spring service for queue message handling.
 *
 * <p>The class currently declares no members; the only content is a commented-out sketch of a
 * RabbitMQ listener method.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class MessageHandler {
// Commented-out listener sketch, kept for reference:
// @SneakyThrows
// @RabbitHandler
// @RabbitListener(queues = "#{llmServiceSettings.getRequestQueueName()}")
// public void receiveIndexingRequest(Message message) {
//
// // TODO: Do something.
// }
}

View File

@ -1,11 +0,0 @@
package com.knecon.fforesight.llm.service.queue;
import org.springframework.context.annotation.Configuration;
import lombok.RequiredArgsConstructor;
/**
 * Spring configuration class for messaging; it currently declares no beans or fields.
 */
@Configuration
@RequiredArgsConstructor
public class MessagingConfiguration {
}

View File

@ -1,69 +0,0 @@
package com.knecon.fforesight.llm.service.services;
import java.util.ArrayList;
import java.util.List;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.stereotype.Service;
import com.azure.ai.openai.OpenAIAsyncClient;
import com.azure.ai.openai.OpenAIClientBuilder;
import com.azure.ai.openai.models.ChatCompletions;
import com.azure.ai.openai.models.ChatCompletionsOptions;
import com.azure.ai.openai.models.ChatMessage;
import com.azure.ai.openai.models.ChatRole;
import com.azure.core.credential.AzureKeyCredential;
import com.knecon.fforesight.llm.service.api.model.ChatEvent;
import com.knecon.fforesight.llm.service.model.SystemMessages;
import com.knecon.fforesight.llm.service.settings.LlmServiceSettings;
import com.knecon.fforesight.tenantcommons.TenantContext;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import reactor.core.publisher.Flux;
@Slf4j
@Service
@RequiredArgsConstructor
public class LlmService {
private final SimpMessagingTemplate websocketTemplate;
private final LlmServiceSettings settings;
private OpenAIAsyncClient client;
@PostConstruct
public void init() {
client = new OpenAIClientBuilder().credential(new AzureKeyCredential(settings.getAzureOpenAiKey())).endpoint(settings.getAzureOpenAiEndpoint()).buildAsyncClient();
}
@SneakyThrows
public void rulesCopilot(List<String> prompt, String userId) {
List<ChatMessage> chatMessages = new ArrayList<>();
chatMessages.add(new ChatMessage(ChatRole.SYSTEM, SystemMessages.RULES_CO_PILOT));
chatMessages.addAll(prompt.stream()
.map(p -> new ChatMessage(ChatRole.USER, p))
.toList());
ChatCompletionsOptions options = new ChatCompletionsOptions(chatMessages);
options.setStream(true);
Flux<ChatCompletions> chatCompletions = client.getChatCompletionsStream(settings.getModel(), options);
chatCompletions.subscribe(chatCompletion -> {
sendWebsocketEvent(userId,
chatCompletion.getChoices()
.get(0).getDelta().getContent());
});
}
private void sendWebsocketEvent(String userId, String token) {
websocketTemplate.convertAndSendToUser(userId, "/queue/" + TenantContext.getTenantId() + "/rules-copilot", new ChatEvent(token));
}
}