Compare commits
1 commit
main ... SPIKE_LLM-

| Author | SHA1 | Date |
|---|---|---|
|  | 78e3f5be56 |  |

149  build.gradle.kts
@@ -1,149 +0,0 @@
import org.springframework.boot.gradle.tasks.bundling.BootBuildImage

plugins {
    java
    id("org.springframework.boot") version "3.3.2"
    id("io.spring.dependency-management") version "1.1.6"
    id("org.sonarqube") version "4.4.1.3373"
    id("io.freefair.lombok") version "8.6"
    pmd
    checkstyle
    jacoco
}

group = "com.knecon.fforesight"
java.sourceCompatibility = JavaVersion.VERSION_17

configurations {
    compileOnly {
        extendsFrom(configurations.annotationProcessor.get())
    }
}

pmd {
    isConsoleOutput = true
}

tasks.pmdMain {
    pmd.ruleSetFiles = files("${projectDir}/config/pmd/pmd.xml")
}

tasks.pmdTest {
    pmd.ruleSetFiles = files("${projectDir}/config/pmd/test_pmd.xml")
}

tasks.jacocoTestReport {
    reports {
        xml.required.set(false)
        csv.required.set(false)
        html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
    }
}

repositories {
    mavenLocal()
    mavenCentral()
    maven {
        url = uri("https://nexus.knecon.com/repository/gindev/");
        credentials {
            username = providers.gradleProperty("mavenUser").getOrNull();
            password = providers.gradleProperty("mavenPassword").getOrNull();
        }
    }
}

tasks.register("publish") {

}

tasks.named<BootBuildImage>("bootBuildImage") {

    environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
    environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")

    imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
    if (project.hasProperty("buildbootDockerHostNetwork")) {
        network.set("host")
    }
    docker {
        if (project.hasProperty("buildbootDockerHostNetwork")) {
            bindHostToBuilder.set(true)
        }
        verboseLogging.set(true)

        publishRegistry {
            username.set(providers.gradleProperty("mavenUser").getOrNull())
            password.set(providers.gradleProperty("mavenPassword").getOrNull())
            email.set(providers.gradleProperty("mavenEmail").getOrNull())
            url.set("https://nexus.knecon.com:5001/")
        }
    }
}

configurations {
    all {
        exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
        exclude(group = "commons-logging", module = "commons-logging")
    }
}

extra["springCloudVersion"] = "2022.0.5"
extra["testcontainersVersion"] = "1.20.0"

dependencies {
    implementation("org.springframework.boot:spring-boot-starter-actuator")
    implementation("org.springframework.boot:spring-boot-starter-amqp")
    implementation("org.springframework.boot:spring-boot-starter-web")
    implementation("org.springframework.cloud:spring-cloud-starter-openfeign")
    implementation("org.springframework.boot:spring-boot-starter-websocket")
    implementation("org.springframework.security:spring-security-messaging:6.1.3")
    implementation("com.iqser.red.commons:storage-commons:2.49.0")
    implementation("com.knecon.fforesight:keycloak-commons:0.29.0")
    implementation("com.knecon.fforesight:swagger-commons:0.7.0")
    implementation("com.azure:azure-ai-openai:1.0.0-beta.5")
    developmentOnly("org.springframework.boot:spring-boot-devtools")
    annotationProcessor("org.springframework.boot:spring-boot-configuration-processor")
    testImplementation("org.springframework.boot:spring-boot-starter-test")
    testImplementation("org.springframework.amqp:spring-rabbit-test")
    implementation("ch.qos.logback:logback-classic")
}

dependencyManagement {
    imports {
        mavenBom("org.testcontainers:testcontainers-bom:${property("testcontainersVersion")}")
        mavenBom("org.springframework.cloud:spring-cloud-dependencies:${property("springCloudVersion")}")
    }
}

tasks.withType<Test> {
    minHeapSize = "1024m"
    maxHeapSize = "2048m"
    useJUnitPlatform()
    reports {
        junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
    }
}

sonarqube {
    properties {
        providers.gradleProperty("sonarToken").getOrNull()?.let { property("sonar.login", it) }
        property("sonar.host.url", "https://sonarqube.knecon.com")
    }
}

tasks.test {
    finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}
tasks.jacocoTestReport {
    dependsOn(tasks.test) // tests are required to run before generating the report
    reports {
        xml.required.set(true)
        csv.required.set(false)
    }
}
7  buildSrc/build.gradle.kts  Normal file
@@ -0,0 +1,7 @@
plugins {
    `kotlin-dsl`
}

repositories {
    gradlePluginPortal()
}
@@ -0,0 +1,86 @@
plugins {
    `java-library`
    `maven-publish`
    pmd
    checkstyle
    jacoco
}

group = "com.knecon.fforesight"

java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17

tasks.pmdMain {
    pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}

tasks.pmdTest {
    pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}

tasks.named<Test>("test") {
    useJUnitPlatform()
    reports {
        junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
    }
    minHeapSize = "512m"
    maxHeapSize = "2048m"
}

tasks.test {
    finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}

tasks.jacocoTestReport {
    dependsOn(tasks.test) // tests are required to run before generating the report
    reports {
        xml.required.set(true)
        csv.required.set(false)
        html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
    }
}

allprojects {

    tasks.withType<Javadoc> {
        options {
            this as StandardJavadocDocletOptions
            addBooleanOption("Xdoclint:none", true)
            addStringOption("Xmaxwarns", "1")
        }
    }

    publishing {
        publications {
            create<MavenPublication>(name) {
                from(components["java"])
            }
        }
        repositories {
            maven {
                url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
                credentials {
                    username = providers.gradleProperty("mavenUser").getOrNull()
                    password = providers.gradleProperty("mavenPassword").getOrNull()
                }
            }
        }
    }
}

java {
    withJavadocJar()
}

repositories {
    mavenLocal()
    mavenCentral()
    maven {
        url = uri("https://nexus.knecon.com/repository/gindev/")
        credentials {
            username = providers.gradleProperty("mavenUser").getOrNull()
            password = providers.gradleProperty("mavenPassword").getOrNull()
        }
    }
}
@@ -19,6 +19,7 @@
    <module name="DefaultComesLast"/>
    <module name="EmptyStatement"/>
    <module name="EqualsHashCode"/>
    <module name="ExplicitInitialization"/>
    <module name="IllegalInstantiation"/>
    <module name="ModifiedControlVariable"/>
    <module name="MultipleVariableDeclarations"/>
@@ -1,16 +1,20 @@
<?xml version="1.0"?>
<ruleset name="Custom Rules"
<ruleset name="Custom ruleset"
         xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 https://pmd.sourceforge.io/ruleset_2_0_0.xsd">
         xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">

    <description>Knecon test pmd rules</description>
    <description>
        Knecon ruleset checks the code for bad stuff
    </description>

    <rule ref="category/java/errorprone.xml">
        <exclude name="DataflowAnomalyAnalysis"/>
        <exclude name="MissingSerialVersionUID"/>
        <exclude name="AvoidLiteralsInIfCondition"/>
        <exclude name="BeanMembersShouldSerialize"/>
        <exclude name="AvoidDuplicateLiterals"/>
        <exclude name="NullAssignment"/>
        <exclude name="AssignmentInOperand"/>
        <exclude name="BeanMembersShouldSerialize"/>
    </rule>

</ruleset>
@@ -1 +1 @@
version = 1.0-SNAPSHOT
version = 0.1-SNAPSHOT
5  llm-service/llm-service-api/build.gradle.kts  Normal file
@@ -0,0 +1,5 @@
plugins {
    `maven-publish`
    id("com.knecon.fforesight.service.java-conventions")
    id("io.freefair.lombok") version "8.4"
}
@@ -0,0 +1,25 @@
package com.knecon.fforesight.llm.service;

import java.util.List;
import java.util.Map;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ChunkingResponse {

    Map<String, String> targetFilePath;
    String responseFilePath;

    List<ChunkingResponseData> data;

}
@@ -0,0 +1,26 @@
package com.knecon.fforesight.llm.service;

import java.util.List;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ChunkingResponseData {

    Integer chunkId;
    String text;
    List<String> types;
    List<List<Integer>> treeIds;
    float[] embedding;
    Integer tokenCount;

}
@@ -0,0 +1,21 @@
package com.knecon.fforesight.llm.service;

import java.util.List;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerEntities {

    List<LlmNerEntity> entities;

}
@@ -0,0 +1,22 @@
package com.knecon.fforesight.llm.service;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerEntity {

    String value;
    String type;
    int startOffset;
    int endOffset;

}
@@ -0,0 +1,29 @@
package com.knecon.fforesight.llm.service;

import java.util.Map;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerMessage {

    Map<String, String> identifier;
    String chunksStorageId;
    String documentStructureStorageId;
    String documentTextStorageId;
    String documentPositionStorageId;
    String documentPagesStorageId;
    String resultStorageId;

}
@@ -0,0 +1,24 @@
package com.knecon.fforesight.llm.service;

import java.util.Map;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LlmNerResponseMessage {

    Map<String, String> identifier;
    int promptTokens;
    int completionTokens;
    int duration;

}
@@ -0,0 +1,9 @@
package com.knecon.fforesight.llm.service;

public class QueueNames {

    public static final String LLM_NER_SERVICE_QUEUE = "llm_entity_request_queue";
    public static final String LLM_NER_SERVICE_RESPONSE_QUEUE = "llm_entity_response_queue";
    public static final String LLM_NER_SERVICE_DLQ = "llm_entity_dead_letter_queue";

}
22  llm-service/llm-service-processor/build.gradle.kts  Normal file
@@ -0,0 +1,22 @@
plugins {
    id("com.knecon.fforesight.service.java-conventions")
    id("io.freefair.lombok") version "8.4"
}

configurations {
    all {
        exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
    }
}
extra["springCloudVersion"] = "2022.0.5"
extra["testcontainersVersion"] = "1.20.0"

dependencies {
    implementation(project(":llm-service-api"))
    implementation("com.knecon.fforesight:layoutparser-service-internal-api:0.159.0")
    implementation("com.iqser.red.commons:storage-commons:2.49.0")
    implementation("org.springframework.boot:spring-boot-starter:3.1.1")
    implementation("com.knecon.fforesight:tenant-commons:0.21.0")
    implementation("com.azure:azure-ai-openai:1.0.0-beta.10")
    implementation("ch.qos.logback:logback-classic:1.5.7")
}
@@ -0,0 +1,12 @@
package com.knecon.fforesight.llm.service;

import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;

@Configuration
@ComponentScan
@EnableConfigurationProperties(LlmServiceSettings.class)
public class LlmServiceConfiguration {

}
@@ -0,0 +1,70 @@
package com.knecon.fforesight.llm.service.document;

import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;

import com.google.common.base.Functions;

public class ConsecutiveBoundaryCollector implements Collector<TextRange, List<TextRange>, List<TextRange>> {

    @Override
    public Supplier<List<TextRange>> supplier() {

        return LinkedList::new;
    }

    @Override
    public BiConsumer<List<TextRange>, TextRange> accumulator() {

        return (existingList, boundary) -> {
            if (existingList.isEmpty()) {
                existingList.add(boundary);
                return;
            }

            TextRange prevTextRange = existingList.get(existingList.size() - 1);
            if (prevTextRange.end() > boundary.start()) {
                throw new IllegalArgumentException(String.format("Can't concatenate %s and %s. Boundaries must be ordered!", prevTextRange, boundary));
            }

            if (prevTextRange.end() == boundary.start()) {
                existingList.remove(existingList.size() - 1);
                existingList.add(TextRange.merge(List.of(prevTextRange, boundary)));
            } else {
                existingList.add(boundary);
            }
        };
    }

    @Override
    public BinaryOperator<List<TextRange>> combiner() {

        return (list1, list2) -> {
            list1.addAll(list2);
            return list1;
        };
    }

    @Override
    public Function<List<TextRange>, List<TextRange>> finisher() {

        return Functions.identity();
    }

    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}
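For context, a minimal usage sketch of this collector: it takes ordered, non-overlapping TextRanges and merges those that touch end-to-start. The concrete index values below are illustrative only; TextRange and ConsecutiveBoundaryCollector are the classes introduced in this change.

    import java.util.List;
    import java.util.stream.Stream;

    // [0,5) and [5,9) share a boundary and are merged; [12,20) stays separate.
    List<TextRange> merged = Stream.of(new TextRange(0, 5), new TextRange(5, 9), new TextRange(12, 20))
            .collect(new ConsecutiveBoundaryCollector());
    // merged -> [Boundary [0|9), Boundary [12|20)]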
@@ -0,0 +1,76 @@
package com.knecon.fforesight.llm.service.document;

import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Stream;

import com.google.common.base.Functions;
import com.knecon.fforesight.llm.service.document.textblock.ConcatenatedTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;

import lombok.NoArgsConstructor;

@NoArgsConstructor
public class ConsecutiveTextBlockCollector implements Collector<TextBlock, List<ConcatenatedTextBlock>, List<TextBlock>> {

    @Override
    public Supplier<List<ConcatenatedTextBlock>> supplier() {

        return LinkedList::new;
    }

    @Override
    public BiConsumer<List<ConcatenatedTextBlock>, TextBlock> accumulator() {

        return (existingList, textBlock) -> {
            if (existingList.isEmpty()) {
                ConcatenatedTextBlock ctb = ConcatenatedTextBlock.empty();
                ctb.concat(textBlock);
                existingList.add(ctb);
                return;
            }

            ConcatenatedTextBlock prevBlock = existingList.get(existingList.size() - 1);

            if (prevBlock.getTextRange().end() == textBlock.getTextRange().start()) {
                prevBlock.concat(textBlock);
            } else {
                ConcatenatedTextBlock ctb = ConcatenatedTextBlock.empty();
                ctb.concat(textBlock);
                existingList.add(ctb);
            }
        };
    }

    @Override
    public BinaryOperator<List<ConcatenatedTextBlock>> combiner() {

        return (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                .toList();
    }

    @Override
    public Function<List<ConcatenatedTextBlock>, List<TextBlock>> finisher() {

        return a -> a.stream()
                .map(tb -> (TextBlock) tb)
                .toList();
    }

    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}
@@ -0,0 +1,29 @@
package com.knecon.fforesight.llm.service.document;

import java.io.Serializable;

import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentData implements Serializable {

    DocumentPage[] documentPages;
    DocumentTextData[] documentTextData;
    DocumentPositionData[] documentPositionData;
    DocumentStructure documentStructure;

}
@@ -0,0 +1,229 @@
package com.knecon.fforesight.llm.service.document;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.DuplicatedParagraph;
import com.knecon.fforesight.llm.service.document.nodes.Footer;
import com.knecon.fforesight.llm.service.document.nodes.Header;
import com.knecon.fforesight.llm.service.document.nodes.Headline;
import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.Page;
import com.knecon.fforesight.llm.service.document.nodes.Paragraph;
import com.knecon.fforesight.llm.service.document.nodes.Section;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.SuperSection;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;

import lombok.experimental.UtilityClass;

@UtilityClass
public class DocumentGraphMapper {

    public Document toDocumentGraph(DocumentData documentData) {

        Document document = new Document();
        DocumentTree documentTree = new DocumentTree(document);
        Context context = new Context(documentData, documentTree);

        context.pageData.addAll(Arrays.stream(documentData.getDocumentPages())
                .map(DocumentGraphMapper::buildPage)
                .toList());

        context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));

        document.setDocumentTree(context.documentTree);
        document.setPages(new HashSet<>(context.pageData));
        document.setNumberOfPages(documentData.getDocumentPages().length);

        document.setTextBlock(document.getTextBlock());
        return document;
    }

    private List<DocumentTree.Entry> buildEntries(List<DocumentStructure.EntryData> entries, Context context) {

        List<DocumentTree.Entry> newEntries = new ArrayList<>(entries.size());
        for (DocumentStructure.EntryData entryData : entries) {

            List<Page> pages = Arrays.stream(entryData.getPageNumbers())
                    .map(pageNumber -> getPage(pageNumber, context))
                    .toList();

            SemanticNode node = switch (entryData.getType()) {
                case SECTION -> buildSection(context);
                case SUPER_SECTION -> buildSuperSection(context);
                case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
                case HEADLINE -> buildHeadline(context);
                case HEADER -> buildHeader(context);
                case FOOTER -> buildFooter(context);
                case TABLE -> buildTable(context, entryData.getProperties());
                case TABLE_CELL -> buildTableCell(context, entryData.getProperties());
                case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers());
                default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
            };

            if (entryData.getAtomicBlockIds().length > 0) {
                TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
                node.setLeafTextBlock(textBlock);
            }
            List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
                    .toList();
            if (entryData.getEngines() != null) {
                entryData.getEngines()
                        .forEach(node::addEngine);
            } else {
                entryData.setEngines(Collections.emptySet());
            }
            node.setTreeId(treeId);

            switch (entryData.getType()) {
                case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
                case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
                default -> pages.forEach(page -> page.getMainBody().add(node));
            }

            newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
        }
        return newEntries;
    }

    private Headline buildHeadline(Context context) {

        return Headline.builder().documentTree(context.documentTree).build();
    }

    private Image buildImage(Context context, Map<String, String> properties, Long[] pageNumbers) {

        assert pageNumbers.length == 1;
        Page page = getPage(pageNumbers[0], context);
        var builder = Image.builder();
        PropertiesMapper.parseImageProperties(properties, builder);
        return builder.documentTree(context.documentTree).page(page).build();
    }

    private TableCell buildTableCell(Context context, Map<String, String> properties) {

        TableCell.TableCellBuilder<?, ?> builder = TableCell.builder();
        PropertiesMapper.parseTableCellProperties(properties, builder);
        return builder.documentTree(context.documentTree).build();
    }

    private Table buildTable(Context context, Map<String, String> properties) {

        Table.TableBuilder builder = Table.builder();
        PropertiesMapper.parseTableProperties(properties, builder);
        return builder.documentTree(context.documentTree).build();
    }

    private Footer buildFooter(Context context) {

        return Footer.builder().documentTree(context.documentTree).build();
    }

    private Header buildHeader(Context context) {

        return Header.builder().documentTree(context.documentTree).build();
    }

    private Section buildSection(Context context) {

        return Section.builder().documentTree(context.documentTree).build();
    }

    private SuperSection buildSuperSection(Context context) {

        return SuperSection.builder().documentTree(context.documentTree).build();
    }

    private Paragraph buildParagraph(Context context, Map<String, String> properties) {

        if (PropertiesMapper.isDuplicateParagraph(properties)) {

            DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();

            Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
            duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
            return duplicatedParagraph;
        }

        return Paragraph.builder().documentTree(context.documentTree).build();
    }

    private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {

        return Arrays.stream(atomicTextBlockIds)
                .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
                .collect(new TextBlockCollector());
    }

    private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {

        return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)),
                context.documentPositionData.get(Math.toIntExact(atomicTextBlockId)),
                parent,
                getPage(context.documentTextData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
    }

    private Page buildPage(DocumentPage p) {

        return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
    }

    private Page getPage(Long pageIndex, Context context) {

        Page page = context.pageData.get(Math.toIntExact(pageIndex) - 1);
        assert page.getNumber() == Math.toIntExact(pageIndex);
        return page;
    }

    static final class Context {

        private final DocumentTree documentTree;
        private final List<Page> pageData;
        private final List<DocumentTextData> documentTextData;
        private final List<DocumentPositionData> documentPositionData;

        Context(DocumentData documentData, DocumentTree documentTree) {

            this.documentTree = documentTree;
            this.pageData = new ArrayList<>();
            this.documentTextData = Arrays.stream(documentData.getDocumentTextData())
                    .toList();
            this.documentPositionData = Arrays.stream(documentData.getDocumentPositionData())
                    .toList();
        }

    }

}
@@ -0,0 +1,371 @@
package com.knecon.fforesight.llm.service.document;

import static java.lang.String.format;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;

import com.knecon.fforesight.llm.service.document.nodes.Document;
import com.knecon.fforesight.llm.service.document.nodes.GenericSemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.NodeType;
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;

@Data
@EqualsAndHashCode
public class DocumentTree {

    private final Entry root;

    public DocumentTree(Document document) {

        root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
    }

    public TextBlock buildTextBlock() {

        return allEntriesInOrder().map(Entry::getNode)
                .filter(SemanticNode::isLeaf)
                .map(SemanticNode::getLeafTextBlock)
                .collect(new TextBlockCollector());
    }

    public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
    }

    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }

    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }

    public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {

        return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
    }

    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
        }

        Entry parent = getEntryById(parentId);
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().treeId(newId).node(node).build());

        return newId;
    }

    private boolean entryExists(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root != null;
        }
        Entry entry = root;
        for (int id : treeId) {
            if (id >= entry.children.size() || 0 > id) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }

    public Entry getParentEntryById(List<Integer> treeId) {

        return getEntryById(getParentId(treeId));
    }

    public boolean hasParentById(List<Integer> treeId) {

        return !treeId.isEmpty();
    }

    public Stream<SemanticNode> childNodes(List<Integer> treeId) {

        return getEntryById(treeId).children.stream()
                .map(Entry::getNode);
    }

    /**
     * Finds all child nodes of the specified entry whose node's textRange intersects the given textRange. It does this by using a binary search to locate the first child entry whose textRange contains the start index of the given TextRange.
     * It then iterates over the remaining children, adding them to the result until one no longer starts before the end of the TextRange. All intersected entries are returned as SemanticNodes.
     *
     * @param treeId    the treeId of the Entry whose children shall be checked.
     * @param textRange the TextRange to find intersecting child nodes for.
     * @return a list of all SemanticNodes that are direct children of the specified Entry and whose TextRange intersects the given TextRange.
     */
    public List<SemanticNode> findIntersectingChildNodes(List<Integer> treeId, TextRange textRange) {

        List<Entry> childEntries = getEntryById(treeId).getChildren();
        List<SemanticNode> intersectingChildEntries = new LinkedList<>();
        int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
        if (startIdx < 0) {
            return intersectingChildEntries;
        }
        for (int i = startIdx; i < childEntries.size(); i++) {
            if (childEntries.get(i).getNode().getTextRange().start() < textRange.end()) {
                intersectingChildEntries.add(childEntries.get(i).getNode());
            } else {
                break;
            }
        }
        return intersectingChildEntries;
    }

    public Optional<SemanticNode> findFirstContainingChild(List<Integer> treeId, TextRange textRange) {

        List<Entry> childEntries = getEntryById(treeId).getChildren();
        int startIdx = findFirstIdxOfContainingChildBinarySearch(childEntries, textRange.start());
        if (startIdx < 0) {
            return Optional.empty();
        }

        if (childEntries.get(startIdx).getNode().getTextRange().contains(textRange.end())) {
            return Optional.of(childEntries.get(startIdx).getNode());
        }

        return Optional.empty();
    }

    public Optional<TableCell> findTableCellInTable(List<Integer> treeId, int start, int end) {

        return findTableCellInTableRecursively(getEntryById(treeId).getChildren(), start, end);
    }

    private Optional<TableCell> findTableCellInTableRecursively(List<Entry> entries, int start, int end) {

        int startIdx = findFirstIdxOfContainingChildBinarySearch(entries, start);
        if (startIdx < 0) {
            return Optional.empty();
        }

        Entry entry = entries.get(startIdx);

        if (entry.getNode().getTextRange().contains(end) && entry.getNode() instanceof TableCell tableCell) {
            if (!entry.getNode().isLeaf()) {
                Optional<TableCell> foundInChildren = findTableCellInTableRecursively(entry.getChildren(), start, end);
                if (foundInChildren.isPresent()) {
                    return foundInChildren;
                }
            }
            return Optional.of(tableCell);
        }

        if (!entry.getNode().isLeaf()) {
            Optional<TableCell> foundInChildren = findTableCellInTableRecursively(entry.getChildren(), start, end);
            if (foundInChildren.isPresent()) {
                return foundInChildren;
            }
        }

        return Optional.empty();
    }

    private int findFirstIdxOfContainingChildBinarySearch(List<Entry> childNodes, int start) {

        int low = 0;
        int high = childNodes.size() - 1;
        while (low <= high) {
            int mid = low + (high - low) / 2;
            TextRange range = childNodes.get(mid).getNode().getTextRange();
            if (range.start() > start) {
                high = mid - 1;
            } else if (range.end() <= start) {
                low = mid + 1;
            } else {
                return mid;
            }
        }
        return -1;
    }

    public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {

        return getEntryById(treeId).children.stream()
                .filter(entry -> entry.node.getType().equals(nodeType))
                .map(Entry::getNode);
    }

    private static List<Integer> getParentId(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (treeId.size() < 2) {
            return Collections.emptyList();
        }
        return treeId.subList(0, treeId.size() - 1);
    }

    public Optional<SemanticNode> getNextSibling(List<Integer> treeId) {

        var siblingTreeId = getNextSiblingId(treeId);
        if (!entryExists(siblingTreeId)) {
            return Optional.empty();
        }
        return Optional.of(getEntryById(siblingTreeId).getNode());
    }

    public List<Integer> getNextSiblingId(List<Integer> treeId) {

        List<Integer> siblingTreeId = new LinkedList<>();
        for (int i = 0; i < treeId.size() - 1; i++) {
            siblingTreeId.add(treeId.get(i));
        }
        siblingTreeId.add(treeId.get(treeId.size() - 1) + 1);
        return siblingTreeId;
    }

    public Optional<SemanticNode> getPreviousSibling(List<Integer> treeId) {

        var siblingTreeId = getPreviousSiblingId(treeId);
        if (!entryExists(siblingTreeId)) {
            return Optional.empty();
        }
        return Optional.of(getEntryById(siblingTreeId).getNode());
    }

    public List<Integer> getPreviousSiblingId(List<Integer> treeId) {

        List<Integer> siblingTreeId = new LinkedList<>();
        for (int i = 0; i < treeId.size() - 1; i++) {
            siblingTreeId.add(treeId.get(i));
        }
        siblingTreeId.add(treeId.get(treeId.size() - 1) - 1);
        return siblingTreeId;
    }

    public Entry getEntryById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root;
        }
        Entry entry = root;
        for (int id : treeId) {
            entry = entry.children.get(id);
        }
        return entry;
    }

    public Stream<Entry> mainEntries() {

        return root.children.stream();
    }

    public Stream<Entry> allEntriesInOrder() {

        return Stream.of(root)
                .flatMap(DocumentTree::flatten);
    }

    public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).children.stream()
                .flatMap(DocumentTree::flatten);
    }

    @Override
    public String toString() {

        return String.join("\n",
                allEntriesInOrder().map(Entry::toString)
                        .toList());
    }

    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry),
                entry.children.stream()
                        .flatMap(DocumentTree::flatten));
    }

    public SemanticNode getHighestParentById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root.node;
        }
        return root.children.get(treeId.get(0)).node;
    }

    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> treeId;
        SemanticNode node;
        @Builder.Default
        List<Entry> children = new ArrayList<>();

        @Override
        public String toString() {

            return node.toString();
        }

        public NodeType getType() {

            return node.getType();
        }

    }

}
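For orientation, the tree-id scheme used throughout this class can be summarised with a small example; the concrete id values are illustrative only and follow directly from getNextSiblingId, getPreviousSiblingId, and getParentId above.

    // For an entry with treeId [0, 2]:
    //   getNextSiblingId([0, 2])     -> [0, 3]   (last index incremented)
    //   getPreviousSiblingId([0, 2]) -> [0, 1]   (last index decremented)
    //   its parent is the entry with treeId [0]; the root has the empty id [].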
@@ -0,0 +1,72 @@
package com.knecon.fforesight.llm.service.document;

import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import com.knecon.fforesight.llm.service.document.nodes.Image;
import com.knecon.fforesight.llm.service.document.nodes.ImageType;
import com.knecon.fforesight.llm.service.document.nodes.Table;
import com.knecon.fforesight.llm.service.document.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;

import lombok.experimental.UtilityClass;

@UtilityClass
public class PropertiesMapper {

    public void parseImageProperties(Map<String, String> properties, Image.ImageBuilder builder) {

        builder.imageType(ImageType.fromString(properties.get(DocumentStructure.ImageProperties.IMAGE_TYPE)));
        builder.transparent(Boolean.parseBoolean(properties.get(DocumentStructure.ImageProperties.TRANSPARENT)));
        builder.position(parseRectangle2D(properties.get(DocumentStructure.ImageProperties.POSITION)));
        builder.id(properties.get(DocumentStructure.ImageProperties.ID));
    }

    public void parseTableCellProperties(Map<String, String> properties, TableCell.TableCellBuilder builder) {

        builder.row(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.ROW)));
        builder.col(Integer.parseInt(properties.get(DocumentStructure.TableCellProperties.COL)));
        builder.header(Boolean.parseBoolean(properties.get(DocumentStructure.TableCellProperties.HEADER)));
        builder.bBox(parseRectangle2D(properties.get(DocumentStructure.TableCellProperties.B_BOX)));
    }

    public void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {

        builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
        builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
    }

    private Rectangle2D parseRectangle2D(String bBox) {

        List<Float> floats = Arrays.stream(bBox.split(DocumentStructure.RECTANGLE_DELIMITER))
                .map(Float::parseFloat)
                .toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

    public static boolean isDuplicateParagraph(Map<String, String> properties) {

        return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
    }

    public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {

        return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
    }

    public static Long[] toLongArray(String ids) {

        return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(","))
                .map(Long::valueOf)
                .toArray(Long[]::new);
    }

}
@@ -0,0 +1,175 @@
package com.knecon.fforesight.llm.service.document;

import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;

import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;

import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;

public class RectangleTransformations {

    public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {

        return atomicTextBlocks.stream()
                .flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
                        .stream())
                .collect(new Rectangle2DBBoxCollector());
    }

    public static Rectangle2D rectangle2DBBox(Collection<Rectangle2D> rectangle2DList) {

        return rectangle2DList.stream()
                .collect(new Rectangle2DBBoxCollector());
    }

    /**
     * If two rectangles are further apart than five times the average width of a rectangle, a gap is inserted.
     *
     * @param rectangle2DList A list of rectangles to combine
     * @return A list of rectangles which are combined if they are closer than the split threshold
     */
    public static List<Rectangle2D> rectangleBBoxWithGaps(List<Rectangle2D> rectangle2DList) {

        if (rectangle2DList.isEmpty()) {
            return Collections.emptyList();
        }
        double splitThreshold = rectangle2DList.stream()
                .mapToDouble(RectangularShape::getWidth).average()
                .orElse(5) * 5.0;

        List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
        List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
        rectangleListsWithGaps.add(rectangleListWithoutGaps);
        Rectangle2D previousRectangle = rectangle2DList.get(0);
        for (Rectangle2D currentRectangle : rectangle2DList) {
            if (Math.abs(currentRectangle.getMinX() - previousRectangle.getMaxX()) > splitThreshold) {
                rectangleListWithoutGaps = new LinkedList<>();
                rectangleListWithoutGaps.add(currentRectangle);
                rectangleListsWithGaps.add(rectangleListWithoutGaps);
                previousRectangle = currentRectangle;
            } else {
                rectangleListWithoutGaps.add(currentRectangle);
                previousRectangle = currentRectangle;
            }
        }
        return rectangleListsWithGaps.stream()
                .map(RectangleTransformations::rectangle2DBBox)
                .toList();
    }

    public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {

        return new Rectangle2DBBoxCollector();
    }

    private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {

        @Override
        public Supplier<BBox> supplier() {

            return BBox::new;
        }

        @Override
        public BiConsumer<BBox, Rectangle2D> accumulator() {

            return BBox::addRectangle;
        }

        @Override
        public BinaryOperator<BBox> combiner() {

            return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
                    Math.min(b1.lowerLeftY, b2.lowerLeftY),
                    Math.max(b1.upperRightX, b2.upperRightX),
                    Math.max(b1.upperRightY, b2.upperRightY));
        }

        @Override
        public Function<BBox, Rectangle2D> finisher() {

            return BBox::toRectangle2D;
        }

        @Override
        public Set<Characteristics> characteristics() {

            return Set.of(Characteristics.UNORDERED);
        }

        @AllArgsConstructor
        @NoArgsConstructor
        private static class BBox {

            Double lowerLeftX;
            Double lowerLeftY;
            Double upperRightX;
            Double upperRightY;

            public Rectangle2D toRectangle2D() {

                if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) {
                    return new Rectangle2D.Double(0, 0, 0, 0);
                }
                return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
            }

            public void addRectangle(Rectangle2D rectangle2D) {

                double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
                double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
                double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
                double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());

                if (this.lowerLeftX == null) {
                    this.lowerLeftX = lowerLeftX;
                } else if (this.lowerLeftX > lowerLeftX) {
                    this.lowerLeftX = lowerLeftX;
                }
                if (this.lowerLeftY == null) {
                    this.lowerLeftY = lowerLeftY;
                } else if (this.lowerLeftY > lowerLeftY) {
                    this.lowerLeftY = lowerLeftY;
                }
                if (this.upperRightX == null) {
                    this.upperRightX = upperRightX;
                } else if (this.upperRightX < upperRightX) {
                    this.upperRightX = upperRightX;
                }
                if (this.upperRightY == null) {
                    this.upperRightY = upperRightY;
                } else if (this.upperRightY < upperRightY) {
                    this.upperRightY = upperRightY;
                }
            }

        }

    }

}
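To make the gap rule in rectangleBBoxWithGaps concrete, here is a small sketch with illustrative values; only java.awt.geom.Rectangle2D and the class above are assumed.

    import java.awt.geom.Rectangle2D;
    import java.util.List;

    // Three rectangles of width 10 -> splitThreshold = 5 * average width = 50.
    List<Rectangle2D> boxes = List.of(
            new Rectangle2D.Double(0, 0, 10, 10),
            new Rectangle2D.Double(20, 0, 10, 10),    // horizontal gap of 10 <= 50 -> same group
            new Rectangle2D.Double(200, 0, 10, 10));  // horizontal gap of 170 > 50 -> new group
    List<Rectangle2D> grouped = RectangleTransformations.rectangleBBoxWithGaps(boxes);
    // grouped -> two bounding boxes: x in [0, 30] and x in [200, 210]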
@ -0,0 +1,250 @@
|
||||
package com.knecon.fforesight.llm.service.document;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
|
||||
* Represents a range of text defined by a start and end index.
|
||||
* Provides functionality to check containment, intersection, and to adjust ranges based on specified conditions.
|
||||
*/
|
||||
@Setter
|
||||
@EqualsAndHashCode
|
||||
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
||||
public class TextRange implements Comparable<TextRange> {
|
||||
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a TextRange with specified start and end indexes.
|
||||
*
|
||||
* @param start The starting index of the range.
|
||||
* @param end The ending index of the range.
|
||||
* @throws IllegalArgumentException If start is greater than end.
|
||||
*/
|
||||
public TextRange(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the length of the text range.
|
||||
*
|
||||
* @return The length of the range.
|
||||
*/
|
||||
public int length() {
|
||||
|
||||
return end - start;
|
||||
}
|
||||
|
||||
|
||||
public int start() {
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
public int end() {
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this {@link TextRange} fully contains another TextRange.
|
||||
*
|
||||
* @param textRange The {@link TextRange} to check.
|
||||
* @return true if this range contains the specified range, false otherwise.
*/
public boolean contains(TextRange textRange) {

return start <= textRange.start() && textRange.end() <= end;
}


/**
* Checks if this {@link TextRange} is fully contained by another TextRange.
*
* @param textRange The {@link TextRange} to check against.
* @return true if this range is contained by the specified range, false otherwise.
*/
public boolean containedBy(TextRange textRange) {

return textRange.contains(this);
}


/**
* Checks if this {@link TextRange} contains another range specified by start and end indices.
*
* @param start The starting index of the range to check.
* @param end The ending index of the range to check.
* @return true if this range fully contains the specified range, false otherwise.
* @throws IllegalArgumentException If the start index is greater than the end index.
*/
public boolean contains(int start, int end) {

if (start > end) {
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
return this.start <= start && end <= this.end;
}


/**
* Checks if this {@link TextRange} is fully contained within another range specified by start and end indices.
*
* @param start The starting index of the outer range.
* @param end The ending index of the outer range.
* @return true if this range is fully contained within the specified range, false otherwise.
* @throws IllegalArgumentException If the start index is greater than the end index.
*/
public boolean containedBy(int start, int end) {

if (start > end) {
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
}
return start <= this.start && this.end <= end;
}


/**
* Determines if the specified index is within this {@link TextRange}.
*
* @param index The index to check.
* @return true if the index is within the range (inclusive of the start and exclusive of the end), false otherwise.
*/
public boolean contains(int index) {

return start <= index && index < end;
}


/**
* Checks if this {@link TextRange} intersects with another {@link TextRange}.
*
* @param textRange The {@link TextRange} to check for intersection.
* @return true if the ranges intersect, false otherwise.
*/
public boolean intersects(TextRange textRange) {

return textRange.start() < this.end && this.start < textRange.end();
}


/**
* Splits this TextRange into multiple ranges based on a list of indices.
*
* @param splitIndices The indices at which to split the range.
* @return A list of TextRanges resulting from the split.
* @throws IndexOutOfBoundsException If any split index is outside this TextRange.
*/
public List<TextRange> split(List<Integer> splitIndices) {

if (splitIndices.stream()
.anyMatch(idx -> !this.contains(idx))) {
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s",
splitIndices.stream()
.filter(idx -> !this.contains(idx))
.toList(),
this));
}
List<TextRange> splitBoundaries = new LinkedList<>();
int previousIndex = start;
for (int i = 0, splitIndicesSize = splitIndices.size(); i < splitIndicesSize; i++) {
int splitIndex = splitIndices.get(i);

// skip split if it would produce a boundary of length 0
if (splitIndex == previousIndex) {
continue;
}
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
previousIndex = splitIndex;
}
splitBoundaries.add(new TextRange(previousIndex, end));
return splitBoundaries;
}


/**
* Merges a collection of TextRanges into a single TextRange encompassing all of them.
*
* @param boundaries The collection of TextRanges to merge.
* @return A new TextRange covering the entire span of the given ranges.
* @throws IllegalArgumentException If boundaries are empty.
*/
public static TextRange merge(Collection<TextRange> boundaries) {

int minStart = boundaries.stream()
.mapToInt(TextRange::start)
.min()
.orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream()
.mapToInt(TextRange::end)
.max()
.orElseThrow(IllegalArgumentException::new);
return new TextRange(minStart, maxEnd);
}


@Override
public String toString() {

return format("Boundary [%d|%d)", start, end);
}


@Override
public int compareTo(TextRange textRange) {

if (end < textRange.end() && start < textRange.start()) {
return -1;
}
if (start > textRange.start() && end > textRange.end()) {
return 1;
}

return 0;
}


/**
* Shrinks the boundary such that textBlock.subSequence(boundary) returns a string without leading or trailing whitespace.
*
* @param textBlock TextBlock to check whitespace against
* @return Trimmed boundary
*/
public TextRange trim(TextBlock textBlock) {

if (this.length() == 0) {
return this;
}

int trimmedStart = this.start;
while (textBlock.containsIndex(trimmedStart) && trimmedStart < end && Character.isWhitespace(textBlock.charAt(trimmedStart))) {
trimmedStart++;
}

int trimmedEnd = this.end;
while (textBlock.containsIndex(trimmedEnd - 1) && trimmedStart < trimmedEnd && Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
trimmedEnd--;
}

return new TextRange(trimmedStart, Math.max(trimmedEnd, trimmedStart));
}

}
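A minimal usage sketch of the range operations above, assuming the two-argument constructor new TextRange(start, end) that split() itself uses and that the class is available on the classpath:

import java.util.List;

import com.knecon.fforesight.llm.service.document.TextRange;

class TextRangeSketch {

    public static void main(String[] args) {

        // [0|20) split at 5 and 12 yields [0|5), [5|12) and [12|20)
        TextRange range = new TextRange(0, 20);
        List<TextRange> parts = range.split(List.of(5, 12));

        // merge recovers the covering range [0|20)
        TextRange merged = TextRange.merge(parts);

        // ranges are half-open: the start index is inside, the end index is not
        System.out.println(parts + " -> " + merged
                + ", contains 0: " + merged.contains(0)
                + ", contains 20: " + merged.contains(20));
    }
}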
@ -0,0 +1,10 @@
package com.knecon.fforesight.llm.service.document.entity;

public enum EntityType {
ENTITY,
HINT,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION,
DICTIONARY_REMOVAL
}
@ -0,0 +1,30 @@
package com.knecon.fforesight.llm.service.document.entity;

import com.knecon.fforesight.llm.service.document.TextRange;

public interface IEntity {

/**
* Gets the value of this entity as a string.
*
* @return The string value.
*/
String getValue();


/**
* Gets the range of text in the document associated with this entity.
*
* @return The text range.
*/
TextRange getTextRange();


/**
* Gets the type of this entity.
*
* @return The entity type.
*/
String type();

}
@ -0,0 +1,46 @@
package com.knecon.fforesight.llm.service.document.entity;

import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.llm.service.document.nodes.Page;

import lombok.experimental.UtilityClass;

@UtilityClass
public final class IdBuilder {

private final HashFunction hashFunction = Hashing.murmur3_128();


public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {

return buildId(pages.stream()
.map(Page::getNumber)
.collect(Collectors.toList()), rectanglesPerLine, type, entityType);
}


public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {

StringBuilder sb = new StringBuilder();
sb.append(type).append(entityType);
List<Integer> sortedPageNumbers = pageNumbers.stream()
.sorted(Comparator.comparingInt(Integer::intValue))
.toList();
sortedPageNumbers.forEach(sb::append);
rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX()))
.append(Math.round(rectangle2D.getY()))
.append(Math.round(rectangle2D.getWidth()))
.append(Math.round(rectangle2D.getHeight())));

return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString();
}

}
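A sketch of how an entity id is derived: the type, the entity type, the sorted page numbers and the rounded per-line rectangles are concatenated and hashed with murmur3_128, so identical geometry on identical pages always yields the same id. The coordinates below are hypothetical:

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.knecon.fforesight.llm.service.document.entity.IdBuilder;

class IdBuilderSketch {

    public static void main(String[] args) {

        // one rectangle per text line covered by the entity (made-up coordinates)
        List<Rectangle2D> rectanglesPerLine = List.of(new Rectangle2D.Double(56.3, 702.1, 120.0, 11.5));

        // @UtilityClass exposes buildId as a static method
        String id = IdBuilder.buildId(List.of(3), rectanglesPerLine, "person", "ENTITY");
        System.out.println(id); // 32 hex characters, i.e. a 128-bit murmur3 hash
    }
}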
@ -0,0 +1,25 @@
package com.knecon.fforesight.llm.service.document.entity;

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.knecon.fforesight.llm.service.document.nodes.Page;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;

@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PositionOnPage {

// Each entry in this list corresponds to an entry in the redaction log. This means a single
// entity might be represented by multiple redaction log entries, because the RedactionLog
// can only handle a single page per entry.
final String id;
Page page;
List<Rectangle2D> rectanglePerLine;

}
@ -0,0 +1,248 @@
|
||||
package com.knecon.fforesight.llm.service.document.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||
import com.knecon.fforesight.llm.service.document.nodes.Page;
|
||||
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
||||
public class TextEntity implements IEntity {
|
||||
|
||||
// primary key
|
||||
@EqualsAndHashCode.Include
|
||||
final String id;
|
||||
// primary key end
|
||||
|
||||
TextRange textRange;
|
||||
@Builder.Default
|
||||
List<TextRange> duplicateTextRanges = new ArrayList<>();
|
||||
String type; // TODO: make final once ManualChangesApplicationService::recategorize is deleted
|
||||
final EntityType entityType;
|
||||
|
||||
// inferred on graph insertion
|
||||
String value;
|
||||
String textBefore;
|
||||
String textAfter;
|
||||
@Builder.Default
|
||||
Set<Page> pages = new HashSet<>();
|
||||
List<PositionOnPage> positionsOnPagePerPage;
|
||||
@Builder.Default
|
||||
List<SemanticNode> intersectingNodes = new LinkedList<>();
|
||||
SemanticNode deepestFullyContainingNode;
|
||||
|
||||
|
||||
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
return TextEntity.builder().id(buildId(node, textRange, type, entityType)).type(type).entityType(entityType).textRange(textRange).build();
|
||||
}
|
||||
|
||||
|
||||
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id) {
|
||||
|
||||
return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build();
|
||||
}
|
||||
|
||||
|
||||
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType, String id, String manualOverwriteSection) {
|
||||
|
||||
return TextEntity.builder().id(id).type(type).entityType(entityType).textRange(textRange).build();
|
||||
}
|
||||
|
||||
|
||||
private static String buildId(SemanticNode node, TextRange textRange, String type, EntityType entityType) {
|
||||
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = node.getPositionsPerPage(textRange);
|
||||
return IdBuilder.buildId(rectanglesPerLinePerPage.keySet(),
|
||||
rectanglesPerLinePerPage.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.toList(),
|
||||
type,
|
||||
entityType.name());
|
||||
}
|
||||
|
||||
|
||||
public void addTextRange(TextRange textRange) {
|
||||
|
||||
duplicateTextRanges.add(textRange);
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
|
||||
|
||||
return intersectingNodes.stream()
|
||||
.anyMatch(clazz::isInstance);
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNode(SemanticNode semanticNode) {
|
||||
|
||||
return intersectingNodes.stream()
|
||||
.anyMatch(node -> node.equals(semanticNode));
|
||||
}
|
||||
|
||||
|
||||
public boolean isType(String type) {
|
||||
|
||||
return type().equals(type);
|
||||
}
|
||||
|
||||
|
||||
public boolean isAnyType(List<String> types) {
|
||||
|
||||
return types.contains(type());
|
||||
}
|
||||
|
||||
|
||||
public void addIntersectingNode(SemanticNode containingNode) {
|
||||
|
||||
intersectingNodes.add(containingNode);
|
||||
}
|
||||
|
||||
|
||||
public String getValueWithLineBreaks() {
|
||||
|
||||
return getDeepestFullyContainingNode().getTextBlock().subSequenceWithLineBreaks(getTextRange());
|
||||
}
|
||||
|
||||
|
||||
public void removeFromGraph() {
|
||||
|
||||
intersectingNodes.forEach(node -> node.getEntities().remove(this));
|
||||
pages.forEach(page -> page.getEntities().remove(this));
|
||||
intersectingNodes = new LinkedList<>();
|
||||
deepestFullyContainingNode = null;
|
||||
pages = new HashSet<>();
|
||||
}
|
||||
|
||||
|
||||
public List<PositionOnPage> getPositionsOnPagePerPage() {
|
||||
|
||||
if (positionsOnPagePerPage == null || positionsOnPagePerPage.isEmpty()) {
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
|
||||
|
||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||
|
||||
positionsOnPagePerPage = rectanglesPerLinePerPage.entrySet()
|
||||
.stream()
|
||||
.map(entry -> buildPositionOnPage(firstPage, id, entry))
|
||||
.toList();
|
||||
}
|
||||
return positionsOnPagePerPage;
|
||||
}
|
||||
|
||||
|
||||
private static PositionOnPage buildPositionOnPage(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
|
||||
|
||||
if (entry.getKey().equals(firstPage)) {
|
||||
return new PositionOnPage(id, entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
return new PositionOnPage(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(TextEntity textEntity) {
|
||||
|
||||
return textEntity.contains(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TextEntity textEntity) {
|
||||
|
||||
if (this.textRange.contains(textEntity.getTextRange())) {
|
||||
return true;
|
||||
}
|
||||
|
||||
List<TextRange> textEntityDuplicateRanges = textEntity.getDuplicateTextRanges();
|
||||
// use optimized indexed loops for extra performance boost
|
||||
for (int i = 0, duplicateTextRangesSize = duplicateTextRanges.size(); i < duplicateTextRangesSize; i++) {
|
||||
TextRange duplicateTextRange = duplicateTextRanges.get(i);
|
||||
if (duplicateTextRange.contains(textEntity.getTextRange())) {
|
||||
return true;
|
||||
}
|
||||
for (int j = 0, textEntityDuplicateRangesSize = textEntityDuplicateRanges.size(); j < textEntityDuplicateRangesSize; j++) {
|
||||
TextRange otherRange = textEntityDuplicateRanges.get(j);
|
||||
if (duplicateTextRange.contains(otherRange)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(TextEntity textEntity) {
|
||||
|
||||
return this.textRange.intersects(textEntity.getTextRange()) //
|
||||
|| duplicateTextRanges.stream()
|
||||
.anyMatch(duplicateTextRange -> duplicateTextRange.intersects(textEntity.textRange)) //
|
||||
|| duplicateTextRanges.stream()
|
||||
.anyMatch(duplicateTextRange -> textEntity.getDuplicateTextRanges()
|
||||
.stream()
|
||||
.anyMatch(duplicateTextRange::intersects));
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesAnnotationId(String manualRedactionId) {
|
||||
|
||||
return getPositionsOnPagePerPage().stream()
|
||||
.anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("Entity[\"");
|
||||
sb.append(value);
|
||||
sb.append("\", ");
|
||||
sb.append(textRange);
|
||||
sb.append(", pages[");
|
||||
pages.forEach(page -> {
|
||||
sb.append(page.getNumber());
|
||||
sb.append(", ");
|
||||
});
|
||||
sb.delete(sb.length() - 2, sb.length());
|
||||
sb.append("], type = \"");
|
||||
sb.append(type());
|
||||
sb.append("\", EntityType.");
|
||||
sb.append(entityType);
|
||||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String type() {
|
||||
|
||||
return getType();
|
||||
}
|
||||
|
||||
}
|
||||
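A sketch of the containment and intersection semantics of TextEntity: the duplicate text ranges participate in both checks, so an entity that re-occurs elsewhere in the document is still matched. The ids and ranges below are hypothetical:

import com.knecon.fforesight.llm.service.document.TextRange;
import com.knecon.fforesight.llm.service.document.entity.EntityType;
import com.knecon.fforesight.llm.service.document.entity.TextEntity;

class TextEntitySketch {

    public static void main(String[] args) {

        TextEntity name = TextEntity.initialEntityNode(new TextRange(100, 110), "person", EntityType.ENTITY, "id-1");
        TextEntity fullName = TextEntity.initialEntityNode(new TextRange(95, 115), "person", EntityType.ENTITY, "id-2");

        // duplicate ranges count towards containment and intersection as well
        name.addTextRange(new TextRange(400, 410));

        boolean contained = name.containedBy(fullName); // true: [100|110) lies inside [95|115)
        boolean overlaps = name.intersects(fullName);   // true
        System.out.println(contained + " " + overlaps);
    }
}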
@ -0,0 +1,73 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.DocumentTree;
|
||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public abstract class AbstractSemanticNode implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
@EqualsAndHashCode.Include
|
||||
List<Integer> treeId;
|
||||
|
||||
TextBlock textBlock;
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,171 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.DocumentTree;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
|
||||
* Represents the entire document as a node within the document's semantic structure.
|
||||
*/
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Document extends AbstractSemanticNode {
|
||||
|
||||
Set<Page> pages;
|
||||
Integer numberOfPages;
|
||||
|
||||
@Builder.Default
|
||||
static final SectionIdentifier sectionIdentifier = SectionIdentifier.document();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.DOCUMENT;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the sections of the document as a list.
|
||||
*
|
||||
* @return A list of all sections within the document.
|
||||
*/
|
||||
public List<Section> getAllSections() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the main sections of the document as a list.
|
||||
*
|
||||
* @return A list of main sections within the document
|
||||
* @deprecated This method is marked for removal.
|
||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||
*/
|
||||
@Deprecated(forRemoval = true)
|
||||
public List<Section> getMainSections() {
|
||||
|
||||
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
|
||||
*
|
||||
* @return A list of all children of type SECTION or SUPER_SECTION.
|
||||
*/
|
||||
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
|
||||
|
||||
return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all terminal (leaf) text blocks within the document in their natural order.
|
||||
*
|
||||
* @return A stream of terminal {@link TextBlock}.
|
||||
*/
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getTextBlock);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getTreeId() {
|
||||
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setTreeId(List<Integer> tocId) {
|
||||
|
||||
throw new UnsupportedOperationException("Document is always the root of the Table of Contents");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
return sectionIdentifier;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
|
||||
.findFirst()
|
||||
.orElseGet(Headline::empty);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all nodes within the document, regardless of type, in their natural order.
|
||||
*
|
||||
* @return A stream of all {@link SemanticNode} within the document.
|
||||
*/
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return getDocumentTree().allEntriesInOrder()
|
||||
.map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all image nodes contained within the document.
|
||||
*
|
||||
* @return A stream of {@link Image} nodes.
|
||||
*/
|
||||
public Stream<Image> streamAllImages() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBox = new HashMap<>();
|
||||
for (Page page : pages) {
|
||||
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
|
||||
}
|
||||
return bBox;
|
||||
}
|
||||
|
||||
}
|
||||
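A sketch of how the Document node is typically consumed once the tree has been built, for example to print a section outline; it relies only on the accessors shown above and assumes an already parsed document:

import com.knecon.fforesight.llm.service.document.nodes.Document;

class DocumentSketch {

    // prints one line per section of an already parsed document
    static void printOutline(Document document) {

        document.getAllSections()
                .forEach(section -> System.out.println(section.getSectionIdentifier() + " " + section.getHeadline()));
    }
}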
@ -0,0 +1,35 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@SuperBuilder
|
||||
public class DuplicatedParagraph extends Paragraph {
|
||||
|
||||
TextBlock unsortedLeafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return Stream.of(leafTextBlock, unsortedLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Footer extends AbstractSemanticNode {
|
||||
|
||||
final static SectionIdentifier sectionIdentifier = SectionIdentifier.empty();
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.FOOTER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
return sectionIdentifier;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
public interface GenericSemanticNode extends SemanticNode {
|
||||
|
||||
}
|
||||
@ -0,0 +1,65 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
|
||||
* Represents the header part of a document page.
|
||||
*/
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Header extends AbstractSemanticNode {
|
||||
|
||||
final static SectionIdentifier sectionIdentifier = SectionIdentifier.empty();
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.HEADER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
return sectionIdentifier;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,100 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
|
||||
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
|
||||
* Represents a headline in a document.
|
||||
*/
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Headline extends AbstractSemanticNode {
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
SectionIdentifier sectionIdentifier;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.HEADLINE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
if (sectionIdentifier == null) {
|
||||
sectionIdentifier = SectionIdentifier.fromSearchText(getTextBlock().getSearchText());
|
||||
}
|
||||
return sectionIdentifier;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates an empty headline with no text content.
|
||||
*
|
||||
* @return An empty {@link Headline} instance.
|
||||
*/
|
||||
public static Headline empty() {
|
||||
|
||||
return Headline.builder().leafTextBlock(AtomicTextBlock.empty(-1L, 0, new Page(), -1, null)).build();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this headline is associated with any paragraphs within its parent section or node.
|
||||
*
|
||||
* @return True if there are paragraphs associated with this headline, false otherwise.
|
||||
*/
|
||||
public boolean hasParagraphs() {
|
||||
|
||||
return getParent().streamAllSubNodesOfType(NodeType.PARAGRAPH)
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,140 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||
import com.knecon.fforesight.llm.service.document.entity.IEntity;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
|
||||
* Represents an image within the document.
|
||||
*/
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Image extends AbstractSemanticNode implements IEntity {
|
||||
|
||||
String id;
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
ImageType imageType;
|
||||
boolean transparent;
|
||||
Rectangle2D position;
|
||||
|
||||
Page page;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.IMAGE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Page> getPages() {
|
||||
|
||||
return Collections.singleton(page);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextRange getTextRange() {
|
||||
|
||||
return leafTextBlock.getTextRange();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
return getTextRange().length();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String type() {
|
||||
|
||||
return getType().toString().toLowerCase(Locale.ENGLISH);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + getValue() + " " + position;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
bBoxPerPage.put(page, position);
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getValue() {
|
||||
|
||||
return NodeType.IMAGE + ":" + camelCase(imageType.toString());
|
||||
}
|
||||
|
||||
|
||||
private String camelCase(String name) {
|
||||
|
||||
return name.charAt(0) + name.substring(1).toLowerCase(Locale.ENGLISH);
|
||||
}
|
||||
|
||||
|
||||
public boolean mostlyContainedBy(Image image, double containmentThreshold) {
|
||||
|
||||
Map<Page, Rectangle2D> bboxImage = image.getBBox();
|
||||
Map<Page, Rectangle2D> bbox = this.getBBox();
|
||||
//image needs to be on the same page
|
||||
if (bboxImage.get(this.page) != null) {
|
||||
Rectangle2D intersection = bboxImage.get(this.page).createIntersection(bbox.get(this.page));
|
||||
double calculatedIntersection = intersection.getWidth() * intersection.getHeight();
|
||||
double area = bbox.get(this.page).getWidth() * bbox.get(this.page).getHeight();
|
||||
return (calculatedIntersection / area) > containmentThreshold;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean mostlyContains(Image image, double containmentThreshold) {
|
||||
|
||||
Map<Page, Rectangle2D> bboxImage = image.getBBox();
|
||||
Map<Page, Rectangle2D> bbox = this.getBBox();
|
||||
Rectangle2D intersection = bbox.get(this.page).createIntersection(bboxImage.get(this.page));
|
||||
double calculatedIntersection = intersection.getWidth() * intersection.getHeight();
|
||||
double area = bbox.get(this.page).getWidth() * bbox.get(this.page).getHeight();
|
||||
return (area / calculatedIntersection) > containmentThreshold;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
public enum ImageType {
|
||||
LOGO,
|
||||
FORMULA,
|
||||
SIGNATURE,
|
||||
OTHER,
|
||||
OCR,
|
||||
GRAPHIC;
|
||||
|
||||
|
||||
public static ImageType fromString(String imageType) {
|
||||
|
||||
return switch (imageType.toLowerCase(Locale.ROOT)) {
|
||||
case "logo" -> ImageType.LOGO;
|
||||
case "formula" -> ImageType.FORMULA;
|
||||
case "signature" -> ImageType.SIGNATURE;
|
||||
case "ocr" -> ImageType.OCR;
|
||||
case "graphic" -> ImageType.GRAPHIC;
|
||||
default -> ImageType.OTHER;
|
||||
};
|
||||
}
|
||||
}
|
||||
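ImageType.fromString is lenient: matching is case-insensitive and any unknown label falls back to OTHER, for example:

import com.knecon.fforesight.llm.service.document.nodes.ImageType;

class ImageTypeSketch {

    public static void main(String[] args) {

        System.out.println(ImageType.fromString("Logo"));    // LOGO
        System.out.println(ImageType.fromString("barcode")); // OTHER, unknown labels fall back
    }
}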
@ -0,0 +1,22 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
public enum NodeType {
|
||||
DOCUMENT,
|
||||
SECTION,
|
||||
SUPER_SECTION,
|
||||
HEADLINE,
|
||||
PARAGRAPH,
|
||||
TABLE,
|
||||
TABLE_CELL,
|
||||
IMAGE,
|
||||
HEADER,
|
||||
FOOTER;
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ENGLISH);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,69 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
|
||||
* Represents a single page in a document.
|
||||
*/
|
||||
@Getter
|
||||
@Setter
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Page {
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
Integer number;
|
||||
Integer height;
|
||||
Integer width;
|
||||
Integer rotation;
|
||||
|
||||
List<SemanticNode> mainBody;
|
||||
Header header;
|
||||
Footer footer;
|
||||
|
||||
@Builder.Default
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
/**
|
||||
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
|
||||
*
|
||||
* @return The main body text block.
|
||||
*/
|
||||
public TextBlock getMainBodyTextBlock() {
|
||||
|
||||
return mainBody.stream()
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.valueOf(number);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,54 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
|
||||
* Represents a paragraph in the document.
|
||||
*/
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PROTECTED)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Paragraph extends AbstractSemanticNode {
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.PARAGRAPH;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,90 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
|
||||
* Represents a section within a document, encapsulating both its textual content and semantic structure.
|
||||
*/
|
||||
@Slf4j
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class Section extends AbstractSemanticNode {
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SECTION;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this section contains any tables.
|
||||
*
|
||||
* @return True if the section contains at least one table, false otherwise.
|
||||
*/
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
return getHeadline().getSectionIdentifier();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||
.map(node -> (Headline) node)//
|
||||
.findFirst()//
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if any headline within this section or its sub-nodes contains a given string.
|
||||
*
|
||||
* @param value The string to search for within headlines, case-sensitive.
|
||||
* @return True if at least one headline contains the specified string, false otherwise.
|
||||
*/
|
||||
public boolean anyHeadlineContainsString(String value) {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsString(value));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if any headline within this section or its sub-nodes contains a given string, case-insensitive.
|
||||
*
|
||||
* @param value The string to search for within headlines, case-insensitive.
|
||||
* @return True if at least one headline contains the specified string, false otherwise.
|
||||
*/
|
||||
public boolean anyHeadlineContainsStringIgnoreCase(String value) {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,158 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
|
||||
* Represents a unique identifier for a section within a document.
|
||||
*/
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
|
||||
private enum Format {
|
||||
EMPTY,
|
||||
NUMERICAL,
|
||||
DOCUMENT
|
||||
}
|
||||
|
||||
Format format;
|
||||
String identifierString;
|
||||
List<Integer> identifiers;
|
||||
boolean asChild;
|
||||
|
||||
|
||||
/**
|
||||
* Generates a SectionIdentifier from the headline text of a section, determining its format and structure.
|
||||
*
|
||||
* @param headline The headline text from which to generate the section identifier.
|
||||
* @return A {@link SectionIdentifier} instance corresponding to the headline text.
|
||||
*/
|
||||
public static SectionIdentifier fromSearchText(String headline) {
|
||||
|
||||
if (headline == null || headline.isEmpty() || headline.isBlank()) {
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline);
|
||||
if (numericalIdentifierMatcher.find()) {
|
||||
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
|
||||
}
|
||||
// more formats here
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Marks the current section identifier as a child of another section.
|
||||
*
|
||||
* @param sectionIdentifier The parent section identifier.
|
||||
* @return A new {@link SectionIdentifier} instance marked as a child.
|
||||
*/
|
||||
public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates a SectionIdentifier that represents the entire document.
|
||||
*
|
||||
* @return A {@link SectionIdentifier} with a document-wide scope.
|
||||
*/
|
||||
public static SectionIdentifier document() {
|
||||
|
||||
return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates an empty SectionIdentifier.
|
||||
*
|
||||
* @return An empty {@link SectionIdentifier} instance.
|
||||
*/
|
||||
public static SectionIdentifier empty() {
|
||||
|
||||
return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
|
||||
}
|
||||
|
||||
|
||||
private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) {
|
||||
|
||||
String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end());
|
||||
List<Integer> identifiers = new LinkedList<>();
|
||||
for (int i = 1; i <= 4; i++) {
|
||||
String numericalIdentifier = numericalIdentifierMatcher.group(i);
|
||||
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
|
||||
break;
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
return new SectionIdentifier(Format.NUMERICAL,
|
||||
identifierString,
|
||||
identifiers.stream()
|
||||
.toList(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the current section is the parent of the given section.
|
||||
*
|
||||
* @param sectionIdentifier The section identifier to compare against.
|
||||
* @return true if the current section is the parent of the given section, false otherwise.
|
||||
*/
|
||||
public boolean isParentOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
if (this.format.equals(Format.EMPTY)) {
|
||||
return false;
|
||||
}
|
||||
if (this.format.equals(Format.DOCUMENT)) {
|
||||
return true;
|
||||
}
|
||||
if (!this.format.equals(sectionIdentifier.format)) {
|
||||
return false;
|
||||
}
|
||||
if (this.identifiers.size() >= sectionIdentifier.identifiers.size() && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < this.identifiers.size(); i++) {
|
||||
if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the current section is a child of the given section, based on their identifiers.
|
||||
*
|
||||
* @param sectionIdentifier The section identifier to compare against.
|
||||
* @return True if the current section is a child of the given section, false otherwise.
|
||||
*/
|
||||
public boolean isChildOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) {
|
||||
return false;
|
||||
}
|
||||
return sectionIdentifier.isParentOf(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
}
|
||||
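A sketch of the numerical identifier handling: fromSearchText extracts up to four leading numbers from a headline, and the parent/child relations are prefix checks on those numbers. The headlines are made up:

import com.knecon.fforesight.llm.service.document.nodes.SectionIdentifier;

class SectionIdentifierSketch {

    public static void main(String[] args) {

        // "2" is parsed into [2], "2.1" into [2, 1]
        SectionIdentifier parent = SectionIdentifier.fromSearchText("2 Scope");
        SectionIdentifier child = SectionIdentifier.fromSearchText("2.1 Definitions");

        System.out.println(parent.isParentOf(child)); // true: [2] is a prefix of [2, 1]
        System.out.println(child.isChildOf(parent));  // true
        System.out.println(SectionIdentifier.document().isParentOf(child)); // true: the document contains everything
    }
}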
@ -0,0 +1,672 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.DocumentTree;
|
||||
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
|
||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
/**
|
||||
* Returns the type of this node, such as Section, Paragraph, etc.
|
||||
*
|
||||
* @return NodeType of this node
|
||||
*/
|
||||
NodeType getType();
|
||||
|
||||
|
||||
/**
|
||||
* Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock.
|
||||
* So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock
|
||||
* If the Node is a Leaf, the LeafTextBlock will be returned instead.
|
||||
*
|
||||
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||
*/
|
||||
default TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Any Node maintains its own Set of Entities.
|
||||
* This Set contains all Entities whose TextRange intersects the TextRange of this node.
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<TextEntity> getEntities();
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<Page> getPages() {
|
||||
|
||||
return getTextBlock().getPages();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finds the first page associated with this Node.
|
||||
*
|
||||
* @return The first Page this node appears on.
|
||||
*/
|
||||
default Page getFirstPage() {
|
||||
|
||||
return getTextBlock().getPages()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages for this TextRange, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<Page> getPages(TextRange textRange) {
|
||||
|
||||
if (!getTextRange().intersects(textRange)) {
|
||||
throw new IllegalArgumentException(format("%s which was used to query for pages does not intersect the %s of this node!", textRange, getTextRange()));
|
||||
}
|
||||
return getTextBlock().getPages(textRange);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the given page number exists in the list of pages.
|
||||
*
|
||||
* @param pageNumber the page number to be checked
|
||||
* @return true if the page number exists, otherwise false
|
||||
*/
|
||||
default boolean onPage(int pageNumber) {
|
||||
|
||||
return getPages().stream()
|
||||
.anyMatch(page -> page.getNumber() == pageNumber);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the DocumentTree Object.
|
||||
*
|
||||
* @return the DocumentTree of the Document this node belongs to
|
||||
*/
|
||||
DocumentTree getDocumentTree();
|
||||
|
||||
|
||||
/**
|
||||
* The id is a List of Integers uniquely identifying this node in the DocumentTree.
|
||||
*
|
||||
* @return the DocumentTree ID
|
||||
*/
|
||||
List<Integer> getTreeId();
|
||||
|
||||
|
||||
/**
|
||||
* This should only be used during graph construction.
|
||||
*
|
||||
* @param tocId List of Integers
|
||||
*/
|
||||
void setTreeId(List<Integer> tocId);
|
||||
|
||||
|
||||
/**
|
||||
* Traverses the Tree up, until it hits a Headline or hits a Section which will then return the first Headline from its children.
|
||||
* If no Headline is found this way, it will recursively traverse the tree up and try again until it hits the root, where it will perform a BFS.
|
||||
* If no Headline exists anywhere in the Document a dummy Headline is returned.
|
||||
*
|
||||
* @return First Headline found.
|
||||
*/
|
||||
default Headline getHeadline() {
|
||||
|
||||
return getParent().getHeadline();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a SectionIdentifier, such that it acts as a child of the first Headline associated with this SemanticNode.
|
||||
*
|
||||
* @return The SectionIdentifier from the first Headline.
|
||||
*/
|
||||
default SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
return SectionIdentifier.asChildOf(getHeadline().getSectionIdentifier());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if its TreeId has a length greater than zero.
|
||||
*
|
||||
* @return boolean indicating whether this Node has a Parent in the DocumentTree
|
||||
*/
|
||||
default boolean hasParent() {
|
||||
|
||||
return getDocumentTree().hasParentById(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The SemanticNode representing the Parent in the DocumentTree
|
||||
* throws NotFoundException, when no parent is present
|
||||
*/
|
||||
default SemanticNode getParent() {
|
||||
|
||||
return getDocumentTree().getParentEntryById(getTreeId()).getNode();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The SemanticNode directly underneath the document under which this node lies.
* If this node is the highest child or the document itself, it returns itself.
|
||||
*/
|
||||
default SemanticNode getHighestParent() {
|
||||
|
||||
return getDocumentTree().getHighestParentById(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the next sibling node of this SemanticNode in the document tree, if any.
|
||||
* If there is no next sibling node, an empty Optional is returned.
|
||||
*
|
||||
* @return Optional containing the next sibling node, or empty if there is none
|
||||
*/
|
||||
default Optional<SemanticNode> getNextSibling() {
|
||||
|
||||
return getDocumentTree().getNextSibling(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the previous sibling node of this SemanticNode in the document tree, if any.
|
||||
* If there is no previous sibling node, an empty Optional is returned.
|
||||
*
|
||||
* @return Optional containing the previous sibling node, or empty if there is none
|
||||
*/
|
||||
default Optional<SemanticNode> getPreviousSibling() {
|
||||
|
||||
return getDocumentTree().getPreviousSibling(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections, Images, and Tables are not leaves.
|
||||
* A TableCell might be a leaf depending on its area compared to the page.
|
||||
*
|
||||
* @return boolean, indicating if a Node has direct access to a TextBlock
|
||||
*/
|
||||
default boolean isLeaf() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
* Returns the LeafTextBlock this node has direct access to.
* Only leaf nodes have one; non-leaf nodes (such as Sections and Tables) throw an UnsupportedOperationException.
*
* @return the LeafTextBlock of this node
*/
default TextBlock getLeafTextBlock() {
|
||||
|
||||
throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode.
|
||||
*
|
||||
* @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
|
||||
*/
|
||||
default void setLeafTextBlock(TextBlock textBlock) {
|
||||
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
|
||||
* If this node does not have any AtomicTextBlocks underneath it (e.g. an empty TableCell), it returns -1.
|
||||
*
|
||||
* @return Integer representing the number on the page
|
||||
*/
|
||||
default Integer getNumberOnPage() {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
|
||||
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the SemanticNode contains any text.
|
||||
*
|
||||
* @return true, if this node's TextBlock is not empty
|
||||
*/
|
||||
default boolean hasText() {
|
||||
|
||||
return !getTextBlock().isEmpty();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string
|
||||
*/
|
||||
default boolean containsString(String string) {
|
||||
|
||||
return getTextBlock().getSearchText().contains(string);
|
||||
}
|
||||
|
||||
|
||||
Set<LayoutEngine> getEngines();
|
||||
|
||||
|
||||
default void addEngine(LayoutEngine engine) {
|
||||
|
||||
getEngines().add(engine);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all strings
|
||||
*/
|
||||
default boolean containsAllStrings(String... strings) {
|
||||
|
||||
return Arrays.stream(strings)
|
||||
.allMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings to check if they are contained in the TextBlock
|
||||
* @return true, if this node's TextBlock contains any of the provided strings
|
||||
*/
|
||||
default boolean containsAnyString(String... strings) {
|
||||
|
||||
return Arrays.stream(strings)
|
||||
.anyMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the strings
|
||||
*/
|
||||
default boolean containsAnyString(List<String> strings) {
|
||||
|
||||
return strings.stream()
|
||||
.anyMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String, ignoring case.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string case-insensitive
|
||||
*/
|
||||
default boolean containsStringIgnoreCase(String string) {
|
||||
|
||||
return getTextBlock().getSearchText().toLowerCase(Locale.ROOT).contains(string.toLowerCase(Locale.ROOT));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings case-insensitive.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the strings
|
||||
*/
|
||||
default boolean containsAnyStringIgnoreCase(String... strings) {
|
||||
|
||||
return Arrays.stream(strings)
|
||||
.anyMatch(this::containsStringIgnoreCase);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all of the provided Strings, ignoring case.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all of the strings
|
||||
*/
|
||||
default boolean containsAllStringsIgnoreCase(String... strings) {
|
||||
|
||||
return Arrays.stream(strings)
|
||||
.allMatch(this::containsStringIgnoreCase);
|
||||
}
|
||||
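As a quick illustration of the string-search helpers above, here is a minimal sketch; it assumes an already parsed SemanticNode and uses placeholder search terms.

// Sketch only: `node` is assumed to come from an already built DocumentTree.
static boolean mentionsVertebrateStudy(SemanticNode node) {
    // all terms must occur somewhere in the node's search text (case-sensitive)
    boolean strict = node.containsAllStrings("vertebrate", "study");
    // any of the spellings may occur, case is ignored
    boolean lenient = node.containsAnyStringIgnoreCase("Vertebrate Study", "vertebrate-study");
    return strict || lenient;
}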
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains exactly the provided String as a word.
|
||||
*
|
||||
* @param word - String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains string
|
||||
*/
|
||||
default boolean containsWord(String word) {
|
||||
|
||||
return getTextBlock().getWords()
|
||||
.stream()
|
||||
.anyMatch(s -> s.equals(word));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains exactly the provided String as a word case-insensitive.
|
||||
*
|
||||
* @param word - String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains string
|
||||
*/
|
||||
default boolean containsWordIgnoreCase(String word) {
|
||||
|
||||
return getTextBlock().getWords()
|
||||
.stream()
|
||||
.map(String::toLowerCase)
|
||||
.anyMatch(s -> s.equals(word.toLowerCase(Locale.ENGLISH)));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings as a word.
|
||||
*
|
||||
* @param words - A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the provided strings
|
||||
*/
|
||||
default boolean containsAnyWord(String... words) {
|
||||
|
||||
return Arrays.stream(words)
|
||||
.anyMatch(word -> getTextBlock().getWords()
|
||||
.stream()
|
||||
.anyMatch(word::equals));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings as a word case-insensitive.
|
||||
*
|
||||
* @param words - A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the provided strings
|
||||
*/
|
||||
default boolean containsAnyWordIgnoreCase(String... words) {
|
||||
|
||||
return Arrays.stream(words)
|
||||
.map(String::toLowerCase)
|
||||
.anyMatch(word -> getTextBlock().getWords()
|
||||
.stream()
|
||||
.map(String::toLowerCase)
|
||||
.anyMatch(word::equals));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings as words.
|
||||
*
|
||||
* @param words - A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all the provided strings
|
||||
*/
|
||||
default boolean containsAllWords(String... words) {
|
||||
|
||||
return Arrays.stream(words)
|
||||
.allMatch(word -> getTextBlock().getWords()
|
||||
.stream()
|
||||
.anyMatch(word::equals));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings as words, ignoring case.
|
||||
*
|
||||
* @param words - A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all the provided strings
|
||||
*/
|
||||
default boolean containsAllWordsIgnoreCase(String... words) {
|
||||
|
||||
return Arrays.stream(words)
|
||||
.map(String::toLowerCase)
|
||||
.allMatch(word -> getTextBlock().getWords()
|
||||
.stream()
|
||||
.map(String::toLowerCase)
|
||||
.anyMatch(word::equals));
|
||||
}
|
||||
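To make the difference between the substring checks and the word checks concrete, here is a small hedged sketch (the node is again assumed to exist).

static void substringVsWord(SemanticNode node) {
    // substring match: also true if the text only contains "catalogue"
    System.out.println(node.containsString("cat"));
    // whole-word match: true only if "cat" appears as a standalone token
    System.out.println(node.containsWord("cat"));
    // case-insensitive variant across several candidate words
    System.out.println(node.containsAnyWordIgnoreCase("Cat", "Dog"));
}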
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode intersects the provided rectangle.
|
||||
*
|
||||
* @param x the lower left corner X value
|
||||
* @param y the lower left corner Y value
|
||||
* @param w width
|
||||
* @param h height
|
||||
* @param pageNumber the pageNumber of the rectangle
|
||||
* @return true if intersects, false otherwise
|
||||
*/
|
||||
default boolean intersectsRectangle(int x, int y, int w, int h, int pageNumber) {
|
||||
|
||||
return getBBox().entrySet()
|
||||
.stream()
|
||||
.filter(entry -> entry.getKey().getNumber() == pageNumber)
|
||||
.map(Map.Entry::getValue)
|
||||
.anyMatch(rect -> rect.intersects(x, y, w, h));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function is used during insertion of TextEntities into the graph. It checks whether this node's TextBlock intersects or even fully contains the TextEntity's TextRange,
|
||||
* registers this node on the entity accordingly and recursively calls this function on all intersecting children.
|
||||
*
|
||||
* @param textEntity the TextEntity which is being inserted into the graph
|
||||
*/
|
||||
default void addThisToEntityIfIntersects(TextEntity textEntity) {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (textBlock.getTextRange().intersects(textEntity.getTextRange())) {
|
||||
if (textBlock.containsTextRange(textEntity.getTextRange())) {
|
||||
textEntity.setDeepestFullyContainingNode(this);
|
||||
}
|
||||
textEntity.addIntersectingNode(this);
|
||||
getDocumentTree().findIntersectingChildNodes(getTreeId(), textEntity.getTextRange())
|
||||
.forEach(node -> node.addThisToEntityIfIntersects(textEntity));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
default Stream<SemanticNode> streamChildren() {
|
||||
|
||||
return getDocumentTree().childNodes(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {
|
||||
|
||||
return getDocumentTree().childNodesOfType(getTreeId(), nodeType);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodes() {
|
||||
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId())
|
||||
.map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
|
||||
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId())
|
||||
.filter(entry -> entry.getType().equals(nodeType))
|
||||
.map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The TextRange is the start and end string offsets in the reading order of the document.
|
||||
*
|
||||
* @return TextRange of this Node's TextBlock
|
||||
*/
|
||||
default TextRange getTextRange() {
|
||||
|
||||
return getTextBlock().getTextRange();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the length of the text content in this Node's TextBlock.
|
||||
*
|
||||
* @return The length of the text content
|
||||
*/
|
||||
default int length() {
|
||||
|
||||
return getTextRange().length();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For a given TextRange this function returns a List of rectangles around the text in the range.
|
||||
* These Rectangles are split either by a new line or by a large gap in the current line.
|
||||
* This is mainly used to find the positions of TextEntities
|
||||
*
|
||||
* @param textRange A TextRange to calculate the positions for.
|
||||
* @return A Map, where the keys are the pages and the values are a list of rectangles describing the position of words
|
||||
*/
|
||||
default Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange textRange) {
|
||||
|
||||
if (isLeaf()) {
|
||||
return getTextBlock().getPositionsPerPage(textRange);
|
||||
}
|
||||
Optional<SemanticNode> containingChildNode = getDocumentTree().findFirstContainingChild(getTreeId(), textRange);
|
||||
if (containingChildNode.isEmpty()) {
|
||||
return getTextBlock().getPositionsPerPage(textRange);
|
||||
}
|
||||
return containingChildNode.get().getPositionsPerPage(textRange);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
|
||||
* If called on the Document, it will return the cropbox of each page
|
||||
*
|
||||
* @return Rectangle2D fully encapsulating this Node for each page.
|
||||
*/
|
||||
default Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (isLeaf()) {
|
||||
return getBBoxFromLeafTextBlock();
|
||||
}
|
||||
|
||||
return getBBoxFromChildren();
|
||||
}
|
||||
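A hedged sketch of how the geometry accessors might be combined to highlight an entity; it assumes a TextEntity whose TextRange falls inside this node and omits the project imports.

static void printHighlightGeometry(SemanticNode node, TextEntity entity) {
    // one bounding box per page the node spans
    Map<Page, Rectangle2D> bBoxPerPage = node.getBBox();
    bBoxPerPage.forEach((page, box) -> System.out.println(page.getNumber() + ": " + box));

    // finer-grained rectangles around the entity's text, split per line and per page
    Map<Page, List<Rectangle2D>> positions = node.getPositionsPerPage(entity.getTextRange());
    positions.forEach((page, rects) -> System.out.println(page.getNumber() + ": " + rects.size() + " rectangles"));
}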
|
||||
|
||||
/**
|
||||
* Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page.
|
||||
*
|
||||
* @param rectangle2D The rectangle to check if it is contained
|
||||
* @param pageNumber The Page number on which the rectangle should be checked
|
||||
* @return boolean
|
||||
*/
|
||||
default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {
|
||||
|
||||
Page helperPage = Page.builder().number(pageNumber).build();
|
||||
if (!getPages().contains(helperPage)) {
|
||||
return false;
|
||||
}
|
||||
return getBBox().get(helperPage).contains(rectangle2D);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* TODO: this produces unwanted results for sections spanning multiple columns.
|
||||
* Computes the Union of the bounding boxes of all children recursively.
|
||||
*
|
||||
* @return The union of the BoundingBoxes of all children
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromChildren() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox)
|
||||
.toList();
|
||||
Set<Page> pages = childrenBBoxes.stream()
|
||||
.flatMap(map -> map.keySet()
|
||||
.stream())
|
||||
.collect(Collectors.toSet());
|
||||
for (Page page : pages) {
|
||||
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
|
||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
bBoxPerPage.put(page, bBoxOnPage);
|
||||
}
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The union of all BoundingBoxes of the TextBlock of this node
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
atomicTextBlockPerPage.forEach((page, atomicTextBlocks) -> bBoxPerPage.put(page, RectangleTransformations.atomicTextBlockBBox(atomicTextBlocks)));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,89 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
|
||||
* Represents a section within a document, encapsulating both its textual content and semantic structure.
|
||||
*/
|
||||
@Slf4j
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class SuperSection extends AbstractSemanticNode {
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SUPER_SECTION;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this section contains any tables.
|
||||
*
|
||||
* @return True if the section contains at least one table, false otherwise.
|
||||
*/
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionIdentifier getSectionIdentifier() {
|
||||
|
||||
return getHeadline().getSectionIdentifier();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||
.map(node -> (Headline) node)//
|
||||
.findFirst()//
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if any headline within this section or its sub-nodes contains a given string.
|
||||
*
|
||||
* @param value The string to search for within headlines, case-sensitive.
|
||||
* @return True if at least one headline contains the specified string, false otherwise.
|
||||
*/
|
||||
public boolean anyHeadlineContainsString(String value) {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsString(value));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if any headline within this section or its sub-nodes contains a given string, case-insensitive.
|
||||
*
|
||||
* @param value The string to search for within headlines, case-insensitive.
|
||||
* @return True if at least one headline contains the specified string, false otherwise.
|
||||
*/
|
||||
public boolean anyHeadlineContainsStringIgnoreCase(String value) {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).anyMatch(h -> h.containsStringIgnoreCase(value));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
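A hedged sketch of how SuperSections might be queried from any node of the tree; the root node and the topic string are placeholders.

static Stream<SuperSection> sectionsAboutToxicology(SemanticNode root) {
    return root.streamAllSubNodesOfType(NodeType.SUPER_SECTION)
            .map(node -> (SuperSection) node)
            // keep sections whose headline mentions the topic, regardless of case
            .filter(section -> section.anyHeadlineContainsStringIgnoreCase("toxicology"))
            // and which actually carry tabular data
            .filter(SuperSection::hasTables);
}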
@ -0,0 +1,306 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.DocumentTree;
|
||||
import com.knecon.fforesight.llm.service.document.entity.TextEntity;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
|
||||
* Represents a table within a document.
|
||||
*/
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Table implements SemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||
@EqualsAndHashCode.Include
|
||||
List<Integer> treeId;
|
||||
DocumentTree documentTree;
|
||||
|
||||
int numberOfRows;
|
||||
int numberOfCols;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@Builder.Default
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
if (bBoxCache == null) {
|
||||
bBoxCache = SemanticNode.super.getBBox();
|
||||
}
|
||||
return bBoxCache;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities in this table that appear in a row which contains all of the provided strings, ignoring case.
|
||||
*
|
||||
* @param strings Strings to check whether a row contains them
|
||||
* @return Stream of all entities in this table that appear in a row containing all of the provided strings
|
||||
*/
|
||||
public Stream<TextEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
|
||||
|
||||
return IntStream.range(0, numberOfRows).boxed()
|
||||
.filter(row -> rowContainsStringsIgnoreCase(row, strings))
|
||||
.flatMap(this::streamRow)
|
||||
.map(TableCell::getEntities)
|
||||
.flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether the specified row contains all the provided strings, ignoring case.
|
||||
*
|
||||
* @param row the row to check as an Integer, must be smaller than numberOfRows
|
||||
* @param strings a list of strings to check for
|
||||
* @return true, if all strings appear in the provided row
|
||||
*/
|
||||
public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {
|
||||
|
||||
String rowText = streamRow(row).map(TableCell::getTextBlock)
|
||||
.collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);
|
||||
return strings.stream()
|
||||
.map(String::toLowerCase)
|
||||
.allMatch(rowText::contains);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities which appear in a row whose cell in a column with the provided header contains the provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param value the string which the table cell should contain
|
||||
* @return a stream of all entities which appear in a row whose cell in a column with the provided header contains the provided value
|
||||
*/
|
||||
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
|
||||
|
||||
List<Integer> vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header))
|
||||
.map(TableCell::getCol)
|
||||
.toList();
|
||||
return streamTableCells().filter(tableCellNode -> vertebrateStudyCols.stream()
|
||||
.anyMatch(vertebrateStudyCol -> getCell(tableCellNode.getRow(), vertebrateStudyCol).containsString(value)))
|
||||
.map(TableCell::getEntities)
|
||||
.flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities which appear in a row whose cell in a column with the provided header contains any of the provided values.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param values the strings which the table cell should contain
|
||||
* @return a stream of all entities which appear in a row whose cell in a column with the provided header contains any of the provided values
|
||||
*/
|
||||
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
|
||||
|
||||
List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
|
||||
.map(TableCell::getCol)
|
||||
.toList();
|
||||
return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
|
||||
.anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values)))
|
||||
.map(TableCell::getEntities)
|
||||
.flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a TableCell at the provided row and column location.
|
||||
*
|
||||
* @param row int representing the row, must be smaller than numberOfRows
|
||||
* @param col int representing the col, must be smaller than numberOfCols
|
||||
* @return TableCell at the provided location in the table
|
||||
*/
|
||||
public TableCell getCell(int row, int col) {
|
||||
|
||||
if (row < 0 || col < 0 || row >= numberOfRows || col >= numberOfCols) {
|
||||
throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
|
||||
}
|
||||
int idx = row * numberOfCols + col;
|
||||
return (TableCell) documentTree.getEntryById(treeId).getChildren().get(idx).getNode();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells in this Table row-wise.
|
||||
*
|
||||
* @return Stream of all TableCells
|
||||
*/
|
||||
public Stream<TableCell> streamTableCells() {
|
||||
|
||||
return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells in this Table which have the provided header row-wise.
|
||||
*
|
||||
* @return Stream of all TableCells which have the provided header
|
||||
*/
|
||||
public Stream<TableCell> streamTableCellsWithHeader(String header) {
|
||||
|
||||
return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header))
|
||||
.map(TableCell::getCol)
|
||||
.flatMap(this::streamCol)
|
||||
.filter(tableCellNode -> !tableCellNode.isHeader());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells belonging to the provided column from top down.
|
||||
*
|
||||
* @param col int representing the column
|
||||
* @return Stream of all TableCell in the provided column
|
||||
*/
|
||||
public Stream<TableCell> streamCol(int col) {
|
||||
|
||||
return IntStream.range(0, numberOfRows).boxed()
|
||||
.map(row -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells belonging to the provided row from left to right.
|
||||
*
|
||||
* @param row int representing the row
|
||||
* @return Stream of all TableCell in the provided row
|
||||
*/
|
||||
public Stream<TableCell> streamRow(int row) {
|
||||
|
||||
return IntStream.range(0, numberOfCols).boxed()
|
||||
.map(col -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells row-wise and filters them with header == true.
|
||||
*
|
||||
* @return Stream of all TableCells with header == true
|
||||
*/
|
||||
public Stream<TableCell> streamHeaders() {
|
||||
|
||||
return streamTableCells().filter(TableCell::isHeader);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells of the provided row and column and filters them with header == true.
|
||||
*
|
||||
* @param row int representing the row
|
||||
* @param col int representing the column
|
||||
* @return Stream of all TableCells with header == true in the provided row or col
|
||||
*/
|
||||
public Stream<TableCell> streamHeadersForCell(int row, int col) {
|
||||
|
||||
return Stream.concat(streamRow(row), streamCol(col))
|
||||
.filter(TableCell::isHeader);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all Headers and checks if any equal the provided string.
|
||||
*
|
||||
* @param header string to check the headers for
|
||||
* @return true, if at least one header equals the provided string
|
||||
*/
|
||||
public boolean hasHeader(String header) {
|
||||
|
||||
return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all Headers and checks if any equal the provided string, ignoring case.
|
||||
*
|
||||
* @param header string to check the headers for
|
||||
* @return true, if at least one header equals the provided string, ignoring case
|
||||
*/
|
||||
public boolean hasHeaderIgnoreCase(String header) {
|
||||
|
||||
return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock()
|
||||
.getSearchText()
|
||||
.strip()
|
||||
.toLowerCase(Locale.ENGLISH)
|
||||
.equals(header.toLowerCase(Locale.ENGLISH)));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
|
||||
*
|
||||
* @param header string to find header cells
|
||||
* @param value string to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
|
||||
*/
|
||||
public boolean hasRowWithHeaderAndValue(String header, String value) {
|
||||
|
||||
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
|
||||
*
|
||||
* @param header string to find header cells
|
||||
* @param values List of strings to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
|
||||
*/
|
||||
public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {
|
||||
|
||||
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.TABLE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = SemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
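A minimal sketch of querying a Table; the header and value strings are placeholders, and the table is assumed to have been built by the document parser.

static Stream<TextEntity> entitiesForSpecies(Table table) {
    // bail out early if the table does not even declare the expected header
    if (!table.hasHeaderIgnoreCase("Species")) {
        return Stream.empty();
    }
    // entities from every row whose "Species" column mentions one of the values
    return table.streamEntitiesWhereRowHasHeaderAndAnyValue("Species", List.of("rat", "mouse"));
}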
@ -0,0 +1,84 @@
|
||||
package com.knecon.fforesight.llm.service.document.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlock;
|
||||
import com.knecon.fforesight.llm.service.document.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
|
||||
* Represents a single table cell within a table.
|
||||
*/
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = true)
|
||||
public class TableCell extends AbstractSemanticNode {
|
||||
|
||||
int row;
|
||||
int col;
|
||||
boolean header;
|
||||
|
||||
Rectangle2D bBox;
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
getPages().forEach(page -> bBoxPerPage.put(page, bBox));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.TABLE_CELL;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return getDocumentTree().getEntryById(getTreeId()).getChildren().isEmpty();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (isLeaf()) {
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,257 @@
|
||||
package com.knecon.fforesight.llm.service.document.textblock;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.text.BreakIterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
|
||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||
import com.knecon.fforesight.llm.service.document.nodes.Page;
|
||||
import com.knecon.fforesight.llm.service.document.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
Long id;
|
||||
Integer numberOnPage;
|
||||
Page page;
|
||||
|
||||
//string coordinates
|
||||
TextRange textRange;
|
||||
String searchText;
|
||||
List<String> words;
|
||||
List<Integer> lineBreaks;
|
||||
|
||||
//position coordinates
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
@Getter
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
SemanticNode parent;
|
||||
|
||||
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return lineBreaks.size() + 1;
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(textBlockIdx)
|
||||
.textRange(new TextRange(stringOffset, stringOffset))
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.page(page)
|
||||
.numberOnPage(numberOnPage)
|
||||
.stringIdxToPositionIdx(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData atomicTextBlockData, DocumentPositionData atomicPositionBlockData, SemanticNode parent, Page page) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(atomicTextBlockData.getId())
|
||||
.numberOnPage(atomicTextBlockData.getNumberOnPage())
|
||||
.page(page)
|
||||
.textRange(new TextRange(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
|
||||
.searchText(atomicTextBlockData.getSearchText())
|
||||
.lineBreaks(Arrays.stream(atomicTextBlockData.getLineBreaks()).boxed()
|
||||
.toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(atomicPositionBlockData.getStringIdxToPositionIdx()).boxed()
|
||||
.toList())
|
||||
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
||||
|
||||
return Arrays.stream(positions)
|
||||
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public TextRange getLineTextRange(int lineNumber) {
|
||||
|
||||
if (lineNumber >= numberOfLines() || lineNumber < 0) {
|
||||
return new TextRange(textRange.start(), textRange.start());
|
||||
}
|
||||
if (numberOfLines() == 1) {
|
||||
return textRange;
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
return new TextRange(textRange.start(), lineBreaks.get(0) + textRange.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return new TextRange(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
|
||||
}
|
||||
return new TextRange(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
|
||||
}
|
||||
|
||||
|
||||
public List<String> getWords() {
|
||||
|
||||
if (words == null) {
|
||||
words = new ArrayList<>();
|
||||
BreakIterator iterator = BreakIterator.getWordInstance(Locale.ENGLISH);
|
||||
iterator.setText(searchText);
|
||||
int start = iterator.first();
|
||||
for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
|
||||
words.add(searchText.substring(start, end));
|
||||
}
|
||||
}
|
||||
return words;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<AtomicTextBlock> getAtomicTextBlocks() {
|
||||
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + textRange.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + textRange.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||
|
||||
if (!containsTextRange(stringTextRange)) {
|
||||
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
|
||||
}
|
||||
if (stringTextRange.length() == 0) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
|
||||
|
||||
if (stringTextRange.end() == this.textRange.end()) {
|
||||
return positions.subList(startPositionIdx, positions.size());
|
||||
}
|
||||
|
||||
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||
|
||||
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
|
||||
.stream()
|
||||
.map(this::getPositions)
|
||||
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
|
||||
rectanglePerLinePerPage.put(page, rectanglesPerLine);
|
||||
return rectanglePerLinePerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String subSequenceWithLineBreaks(TextRange textRange) {
|
||||
|
||||
if (textRange.length() == 0 || !getTextRange().contains(textRange)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
Set<Integer> lbInBoundary = lineBreaks.stream()
|
||||
.map(i -> i + textRange.start())
|
||||
.filter(textRange::contains)
|
||||
.collect(Collectors.toSet());
|
||||
if (textRange.end() == getTextRange().end()) {
|
||||
lbInBoundary.add(getTextRange().end());
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = textRange.start(); i < textRange.end(); i++) {
|
||||
char character = this.charAt(i);
|
||||
if (lbInBoundary.contains(i + 1)) {
|
||||
// always plus one, due to the linebreaks being an exclusive end index
|
||||
if (!Character.isWhitespace(character)) {
|
||||
lbInBoundary.remove(i + 1);
|
||||
lbInBoundary.add(i + 2);
|
||||
sb.append(character);
|
||||
continue;
|
||||
}
|
||||
sb.append("\n");
|
||||
} else {
|
||||
sb.append(character);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
||||
|
||||
return getLineBreaks().stream()
|
||||
.map(linebreak -> linebreak + this.textRange.start())
|
||||
.filter(textRange::contains)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return searchText;
|
||||
}
|
||||
|
||||
}
|
||||
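A small sketch that walks the lines of a TextBlock via the line helpers; it works for an AtomicTextBlock as well as for a concatenated block.

static void printLines(TextBlock textBlock) {
    // numberOfLines() is lineBreaks.size() + 1 for an AtomicTextBlock
    for (int line = 0; line < textBlock.numberOfLines(); line++) {
        // getLine(..) resolves the line's TextRange and cuts it out of the search text
        System.out.println(line + ": " + textBlock.getLine(line));
    }
}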
@ -0,0 +1,268 @@
|
||||
package com.knecon.fforesight.llm.service.document.textblock;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||
import com.knecon.fforesight.llm.service.document.nodes.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
List<AtomicTextBlock> atomicTextBlocks;
|
||||
String searchText;
|
||||
TextRange textRange;
|
||||
|
||||
|
||||
public static ConcatenatedTextBlock empty() {
|
||||
|
||||
return new ConcatenatedTextBlock(Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
this.atomicTextBlocks = new LinkedList<>();
|
||||
if (atomicTextBlocks.isEmpty()) {
|
||||
textRange = new TextRange(-1, -1);
|
||||
return;
|
||||
}
|
||||
var firstTextBlock = atomicTextBlocks.get(0);
|
||||
this.atomicTextBlocks.add(firstTextBlock);
|
||||
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
|
||||
|
||||
atomicTextBlocks.subList(1, atomicTextBlocks.size())
|
||||
.forEach(this::concat);
|
||||
}
|
||||
|
||||
|
||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||
|
||||
if (this.atomicTextBlocks.isEmpty()) {
|
||||
textRange.setStart(textBlock.getTextRange().start());
|
||||
textRange.setEnd(textBlock.getTextRange().end());
|
||||
} else if (textRange.end() != textBlock.getTextRange().start()) {
|
||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
|
||||
}
|
||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||
textRange.setEnd(textBlock.getTextRange().end());
|
||||
this.searchText = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||
|
||||
return atomicTextBlocks.stream()
|
||||
.filter(textBlock -> textBlock.getTextRange().contains(stringIdx))
|
||||
.findAny()
|
||||
.orElseThrow(IndexOutOfBoundsException::new);
|
||||
}
|
||||
|
||||
|
||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
|
||||
|
||||
List<AtomicTextBlock> intersectingAtomicTextBlocks = new LinkedList<>();
|
||||
for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) {
|
||||
if (atomicTextBlock.getTextRange().start() > textRange.end()) {
|
||||
break; // early stop, following TextBlocks will never intersect
|
||||
}
|
||||
if (atomicTextBlock.getTextRange().intersects(textRange)) {
|
||||
intersectingAtomicTextBlocks.add(atomicTextBlock);
|
||||
}
|
||||
}
|
||||
return intersectingAtomicTextBlocks;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getSearchText() {
|
||||
|
||||
if (searchText == null) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
|
||||
searchText = sb.toString();
|
||||
}
|
||||
return searchText;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<String> getWords() {
|
||||
|
||||
return atomicTextBlocks.stream()
|
||||
.map(AtomicTextBlock::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return atomicTextBlocks.stream()
|
||||
.mapToInt(AtomicTextBlock::numberOfLines).sum();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getLineBreaks() {
|
||||
|
||||
return getAtomicTextBlocks().stream()
|
||||
.flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
|
||||
.stream())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
|
||||
}
|
||||
|
||||
|
||||
public TextRange getLineTextRange(int lineNumber) {
|
||||
|
||||
if (atomicTextBlocks.size() == 1) {
|
||||
return atomicTextBlocks.get(0).getLineTextRange(lineNumber);
|
||||
}
|
||||
int lineNumberInCurrentBlock = lineNumber;
|
||||
for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) {
|
||||
if (lineNumberInCurrentBlock < atomicTextBlock.numberOfLines()) {
|
||||
return atomicTextBlock.getLineTextRange(lineNumberInCurrentBlock);
|
||||
}
|
||||
lineNumberInCurrentBlock -= atomicTextBlock.numberOfLines();
|
||||
}
|
||||
return new TextRange(textRange.start(), textRange.start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||
|
||||
if (textBlocks.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositions(stringTextRange);
|
||||
}
|
||||
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
positions.addAll(textBlock.getPositions());
|
||||
}
|
||||
|
||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||
|
||||
return positions;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||
|
||||
if (textBlocks.isEmpty()) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositionsPerPage(stringTextRange);
|
||||
}
|
||||
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
|
||||
}
|
||||
|
||||
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
||||
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
|
||||
stringTextRange.end())));
|
||||
|
||||
return rectanglesPerLinePerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String subSequenceWithLineBreaks(TextRange textRange) {
|
||||
|
||||
if (textRange.length() == 0 || !getTextRange().contains(textRange)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(textRange);
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).subSequenceWithLineBreaks(textRange);
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(textRange.start(), firstTextBlock.getTextRange().end())));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
sb.append(textBlock.searchTextWithLineBreaks());
|
||||
}
|
||||
|
||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), textRange.end())));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
|
||||
|
||||
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
|
||||
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
|
||||
rectangles,
|
||||
(l1, l2) -> Stream.concat(l1.stream(), l2.stream())
|
||||
.toList()));
|
||||
return mergedMap;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getSearchText();
|
||||
}
|
||||
|
||||
}
|
||||
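A hedged sketch of the concatenation contract: the blocks must be consecutive in the document's string offsets, otherwise concat fails fast.

static TextBlock joinConsecutive(List<AtomicTextBlock> orderedBlocks) {
    // concatenates the list in order; throws an UnsupportedOperationException
    // as soon as two blocks are not adjacent in their TextRanges
    return new ConcatenatedTextBlock(orderedBlocks);
}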
@ -0,0 +1,176 @@
|
||||
package com.knecon.fforesight.llm.service.document.textblock;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.llm.service.document.RectangleTransformations;
|
||||
import com.knecon.fforesight.llm.service.document.TextRange;
|
||||
import com.knecon.fforesight.llm.service.document.nodes.Page;
|
||||
|
||||
public interface TextBlock extends CharSequence {
|
||||
|
||||
String getSearchText();
|
||||
|
||||
|
||||
List<String> getWords();
|
||||
|
||||
|
||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||
|
||||
|
||||
TextRange getTextRange();
|
||||
|
||||
|
||||
int getNextLinebreak(int fromIndex);
|
||||
|
||||
|
||||
int getPreviousLinebreak(int fromIndex);
|
||||
|
||||
|
||||
TextRange getLineTextRange(int lineNumber);
|
||||
|
||||
|
||||
List<Integer> getLineBreaks();
|
||||
|
||||
|
||||
Rectangle2D getPosition(int stringIdx);
|
||||
|
||||
|
||||
List<Rectangle2D> getPositions(TextRange stringTextRange);
|
||||
|
||||
|
||||
Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);
|
||||
|
||||
|
||||
String subSequenceWithLineBreaks(TextRange textRange);
|
||||
|
||||
|
||||
int numberOfLines();
|
||||
|
||||
|
||||
default CharSequence getLine(int lineNumber) {
|
||||
|
||||
return subSequence(getLineTextRange(lineNumber));
|
||||
}
|
||||
|
||||
|
||||
default List<Rectangle2D> getLinePositions(int lineNumber) {
|
||||
|
||||
return getPositions(getLineTextRange(lineNumber));
|
||||
}
|
||||
|
||||
|
||||
default Rectangle2D getLineBBox(int lineNumber) {
|
||||
|
||||
return RectangleTransformations.rectangle2DBBox(getLinePositions(lineNumber));
|
||||
}
|
||||
|
||||
|
||||
default String searchTextWithLineBreaks() {
|
||||
|
||||
return subSequenceWithLineBreaks(getTextRange());
|
||||
}
|
||||
|
||||
|
||||
default int indexOf(String searchTerm) {
|
||||
|
||||
return indexOf(searchTerm, getTextRange().start());
|
||||
}
|
||||
|
||||
|
||||
default Set<Page> getPages() {
|
||||
|
||||
return getAtomicTextBlocks().stream()
|
||||
.map(AtomicTextBlock::getPage)
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
default Set<Page> getPages(TextRange textRange) {
|
||||
|
||||
return getAtomicTextBlocks().stream()
|
||||
.filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
|
||||
.map(AtomicTextBlock::getPage)
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
default int indexOf(String searchTerm, int startOffset) {
|
||||
|
||||
int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
|
||||
if (start == -1) {
|
||||
return -1;
|
||||
}
|
||||
return start + getTextRange().start();
|
||||
}
|
||||
|
||||
|
||||
default CharSequence getFirstLine() {
|
||||
|
||||
return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
|
||||
}
|
||||
|
||||
|
||||
default boolean containsTextRange(TextRange textRange) {
|
||||
|
||||
if (textRange.end() < textRange.start()) {
|
||||
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
|
||||
}
|
||||
return getTextRange().contains(textRange);
|
||||
}
|
||||
|
||||
|
||||
default boolean containsIndex(int stringIndex) {
|
||||
|
||||
return getTextRange().contains(stringIndex);
|
||||
}
|
||||
|
||||
|
||||
default CharSequence subSequence(TextRange textRange) {
|
||||
|
||||
return subSequence(textRange.start(), textRange.end());
|
||||
}
|
||||
|
||||
|
||||
default String buildSummary() {
|
||||
|
||||
String searchText = getSearchText();
|
||||
// substring, as splitting very large strings gets expensive
|
||||
searchText = searchText.substring(0, Math.min(searchText.length(), 200));
|
||||
|
||||
String[] words = searchText.split(" ");
|
||||
int bound = Math.min(words.length, 4);
|
||||
List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));
|
||||
|
||||
return String.join(" ", list);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default CharSequence subSequence(int start, int end) {
|
||||
|
||||
return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default int length() {
|
||||
|
||||
return getTextRange().length();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default char charAt(int index) {
|
||||
|
||||
return getSearchText().charAt(index - getTextRange().start());
|
||||
}
|
||||
|
||||
}
|
||||
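A sketch combining the offset-based helpers: indexOf returns absolute string offsets, so the resulting TextRange can be fed straight back into getPositions; the search term is a placeholder.

static List<Rectangle2D> positionsOfFirstMatch(TextBlock textBlock, String term) {
    int start = textBlock.indexOf(term); // absolute offset, -1 if absent
    if (start == -1) {
        return Collections.emptyList();
    }
    // rectangles covering exactly the matched characters
    return textBlock.getPositions(new TextRange(start, start + term.length()));
}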
@ -0,0 +1,49 @@
|
||||
package com.knecon.fforesight.llm.service.document.textblock;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@NoArgsConstructor
|
||||
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {
|
||||
|
||||
@Override
|
||||
public Supplier<ConcatenatedTextBlock> supplier() {
|
||||
|
||||
return ConcatenatedTextBlock::empty;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<ConcatenatedTextBlock> combiner() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<ConcatenatedTextBlock, TextBlock> finisher() {
|
||||
|
||||
return a -> a;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT);
|
||||
}
|
||||
|
||||
}
|
||||
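This collector is what Table.rowContainsStringsIgnoreCase relies on; a minimal sketch of folding a stream of cell TextBlocks into one block:

static String rowText(Stream<TextBlock> cellTextBlocks) {
    // consecutive TextBlocks are folded into a single ConcatenatedTextBlock
    TextBlock joined = cellTextBlocks.collect(new TextBlockCollector());
    return joined.getSearchText();
}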
@ -0,0 +1,64 @@
|
||||
package com.knecon.fforesight.llm.service.services;
|
||||
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.ai.openai.OpenAIAsyncClient;
|
||||
import com.azure.ai.openai.OpenAIClient;
|
||||
import com.azure.ai.openai.OpenAIClientBuilder;
|
||||
import com.azure.ai.openai.models.ChatCompletions;
|
||||
import com.azure.ai.openai.models.ChatCompletionsOptions;
|
||||
import com.azure.core.credential.AzureKeyCredential;
|
||||
import com.knecon.fforesight.llm.service.LlmServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import reactor.core.publisher.Flux;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class LlmRessource {
|
||||
|
||||
OpenAIAsyncClient asyncClient;
|
||||
OpenAIClient client;
|
||||
LlmServiceSettings settings;
|
||||
BlockingQueue<ChatCompletionsOptions> concurrencyCounter;
|
||||
|
||||
|
||||
public LlmRessource(@Value("${llm-service.azureOpenAiEndpoint}") String azureEndpoint, @Value("${llm-service.azureOpenAiKey}") String azureKey, LlmServiceSettings settings) {
|
||||
|
||||
this.settings = settings;
|
||||
this.concurrencyCounter = new ArrayBlockingQueue<>(settings.getConcurrency());
|
||||
this.asyncClient = new OpenAIClientBuilder().credential(new AzureKeyCredential(azureKey)).endpoint(azureEndpoint).buildAsyncClient();
|
||||
this.client = new OpenAIClientBuilder().credential(new AzureKeyCredential(azureKey)).endpoint(azureEndpoint).buildClient();
|
||||
}
|
||||
|
||||
|
||||
public Flux<ChatCompletions> getChatCompletionsFlux(ChatCompletionsOptions options) {
|
||||
|
||||
options.setStream(true);
|
||||
return asyncClient.getChatCompletionsStream(settings.getModel(), options);
|
||||
}
|
||||
|
||||
|
||||
public ChatCompletions getChatCompletions(ChatCompletionsOptions options) throws InterruptedException {
|
||||
|
||||
concurrencyCounter.put(options);
|
||||
ChatCompletions chatCompletions = client.getChatCompletions(settings.getModel(), options);
|
||||
concurrencyCounter.remove(options);
|
||||
return chatCompletions;
|
||||
}
|
||||
|
||||
|
||||
public int getCurrentConcurrency() {
|
||||
|
||||
return concurrencyCounter.size();
|
||||
}
|
||||
|
||||
}
|
||||
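A hedged sketch of calling the resource. ChatRequestUserMessage is assumed to be available in the azure-ai-openai SDK version in use (the message classes differ between SDK versions), so treat the option construction as an assumption rather than the definitive API.

static String askOnce(LlmRessource llm, String prompt) throws InterruptedException {
    // one user message; the deployment name comes from LlmServiceSettings inside the resource
    ChatCompletionsOptions options = new ChatCompletionsOptions(List.of(new ChatRequestUserMessage(prompt)));
    // blocks while the concurrency queue is full, then forwards to the synchronous client
    ChatCompletions completions = llm.getChatCompletions(options);
    return completions.getChoices().get(0).getMessage().getContent();
}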
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.llm.service.services;
|
||||
|
||||
public interface WebSocketMessagingTemplate {
|
||||
|
||||
void sendEvent(String userId, String token, Object payload);
|
||||
|
||||
}
|
||||
@ -0,0 +1,29 @@
|
||||
package com.knecon.fforesight.llm.service.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class FormattingUtils {
|
||||
|
||||
public String humanizeDuration(long duration) {
|
||||
|
||||
if (duration < 1000) {
|
||||
return duration + " ms";
|
||||
} else if (duration < 60 * 1000) {
|
||||
double seconds = duration / 1000.0;
|
||||
return String.format("%.1f s", seconds);
|
||||
} else if (duration < 60 * 60 * 1000) {
|
||||
long minutes = duration / (60 * 1000);
|
||||
long remainingMillis = duration % (60 * 1000);
|
||||
double seconds = remainingMillis / 1000.0;
|
||||
return String.format("%d:%.1f m", minutes, seconds);
|
||||
} else {
|
||||
long hours = duration / (60 * 60 * 1000);
|
||||
long remainingMillis = duration % (60 * 60 * 1000);
|
||||
long minutes = remainingMillis / (60 * 1000);
|
||||
remainingMillis = remainingMillis % (60 * 1000);
|
||||
double seconds = remainingMillis / 1000.0;
|
||||
return String.format("%d:%d:%.1f h", hours, minutes, seconds);
|
||||
}
|
||||
}
|
||||
}
|
||||
69
llm-service/llm-service-server/build.gradle.kts
Normal file
69
llm-service/llm-service-server/build.gradle.kts
Normal file
@ -0,0 +1,69 @@
|
||||
import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
|
||||
|
||||
plugins {
|
||||
application
|
||||
id("com.knecon.fforesight.service.java-conventions")
|
||||
id("org.springframework.boot") version "3.2.3"
|
||||
id("io.spring.dependency-management") version "1.1.3"
|
||||
id("org.sonarqube") version "4.3.0.3225"
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
configurations {
|
||||
all {
|
||||
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
|
||||
exclude(group = "commons-logging", module = "commons-logging")
|
||||
}
|
||||
}
|
||||
|
||||
val springBootVersion = "3.1.1"
|
||||
val springCloudVersion = "2022.0.5"
|
||||
val springSecurityVersion = "6.1.3"
|
||||
val testcontainersVersion = "1.20.0"
|
||||
|
||||
dependencies {
|
||||
implementation(project(":llm-service-api"))
|
||||
implementation(project(":llm-service-processor"))
|
||||
|
||||
implementation("org.springframework.boot:spring-boot-starter-actuator:$springBootVersion")
|
||||
implementation("org.springframework.boot:spring-boot-starter-amqp:$springBootVersion")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:$springBootVersion")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.3")
|
||||
implementation("org.springframework.boot:spring-boot-starter-websocket:$springBootVersion")
|
||||
implementation("org.springframework.security:spring-security-messaging:$springSecurityVersion")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.49.0")
|
||||
implementation("com.knecon.fforesight:keycloak-commons:0.29.0")
|
||||
implementation("com.knecon.fforesight:swagger-commons:0.7.0")
|
||||
implementation("ch.qos.logback:logback-classic")
|
||||
|
||||
developmentOnly("org.springframework.boot:spring-boot-devtools:$springBootVersion")
|
||||
annotationProcessor("org.springframework.boot:spring-boot-configuration-processor:$springBootVersion")
|
||||
testImplementation("org.springframework.boot:spring-boot-starter-test:$springBootVersion")
|
||||
testImplementation("org.springframework.amqp:spring-rabbit-test:$springBootVersion")
|
||||
}
|
||||
|
||||
|
||||
tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
|
||||
|
||||
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
|
||||
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
|
||||
|
||||
imageName.set("nexus.knecon.com:5001/ff/${project.name}:${project.version}")
|
||||
if (project.hasProperty("buildbootDockerHostNetwork")) {
|
||||
network.set("host")
|
||||
}
|
||||
docker {
|
||||
if (project.hasProperty("buildbootDockerHostNetwork")) {
|
||||
bindHostToBuilder.set(true)
|
||||
}
|
||||
verboseLogging.set(true)
|
||||
|
||||
publishRegistry {
|
||||
username.set(providers.gradleProperty("mavenUser").getOrNull())
|
||||
password.set(providers.gradleProperty("mavenPassword").getOrNull())
|
||||
email.set(providers.gradleProperty("mavenEmail").getOrNull())
|
||||
url.set("https://nexus.knecon.com:5001/")
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -17,7 +17,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
@EnableWebMvc
|
||||
@EnableAsync
|
||||
@Import({StorageAutoConfiguration.class})
|
||||
@Import({StorageAutoConfiguration.class, LlmServiceConfiguration.class})
|
||||
@ImportAutoConfiguration({StorageAutoConfiguration.class, MultiTenancyAutoConfiguration.class, SpringDocAutoConfiguration.class, DefaultKeyCloakCommonsAutoConfiguration.class})
|
||||
@SpringBootApplication
|
||||
public class Application {
|
||||
@ -5,7 +5,7 @@ import org.springframework.web.bind.annotation.ExceptionHandler;
|
||||
import org.springframework.web.bind.annotation.RestControllerAdvice;
|
||||
import org.springframework.web.server.ResponseStatusException;
|
||||
|
||||
import com.knecon.fforesight.llm.service.api.ErrorMessage;
|
||||
import com.knecon.fforesight.llm.service.ErrorMessage;
|
||||
|
||||
@RestControllerAdvice
|
||||
public class ControllerAdvice {
|
||||
@ -6,7 +6,7 @@ import org.springframework.messaging.handler.annotation.MessageMapping;
|
||||
import org.springframework.messaging.handler.annotation.Payload;
|
||||
import org.springframework.stereotype.Controller;
|
||||
|
||||
import com.knecon.fforesight.llm.service.api.model.PromptList;
|
||||
import com.knecon.fforesight.llm.service.PromptList;
|
||||
import com.knecon.fforesight.llm.service.services.LlmService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -0,0 +1,47 @@
|
||||
package com.knecon.fforesight.llm.service.queue;
|
||||
|
||||
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_QUEUE;
|
||||
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_RESPONSE_QUEUE;
|
||||
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.llm.service.LlmNerMessage;
|
||||
import com.knecon.fforesight.llm.service.LlmNerResponseMessage;
|
||||
import com.knecon.fforesight.llm.service.services.LlmNerService;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class MessageHandler {
|
||||
|
||||
LlmNerService llmNerService;
|
||||
|
||||
RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = LLM_NER_SERVICE_QUEUE)
|
||||
public void receiveNerRequest(LlmNerMessage message) {
|
||||
|
||||
LlmNerService.Usage usage = llmNerService.runNer(message);
|
||||
|
||||
LlmNerResponseMessage llmNerResponseMessage = new LlmNerResponseMessage(message.getIdentifier(),
|
||||
usage.completionTokenCount(),
|
||||
usage.promptTokenCount(),
|
||||
Math.toIntExact(usage.durationMillis()));
|
||||
|
||||
rabbitTemplate.convertAndSend(LLM_NER_SERVICE_RESPONSE_QUEUE, llmNerResponseMessage);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,41 @@
|
||||
package com.knecon.fforesight.llm.service.queue;
|
||||
|
||||
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_DLQ;
|
||||
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_QUEUE;
|
||||
import static com.knecon.fforesight.llm.service.QueueNames.LLM_NER_SERVICE_RESPONSE_QUEUE;
|
||||
|
||||
import org.springframework.amqp.core.Queue;
|
||||
import org.springframework.amqp.core.QueueBuilder;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Configuration
|
||||
@RequiredArgsConstructor
|
||||
public class MessagingConfiguration {
|
||||
|
||||
@Bean
|
||||
public Queue llmNerRequestQueue() {
|
||||
|
||||
return QueueBuilder.durable(LLM_NER_SERVICE_QUEUE).withArgument("x-dead-letter-exchange", "").withArgument("x-dead-letter-routing-key", LLM_NER_SERVICE_DLQ).build();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue llmNerResponseQueue() {
|
||||
|
||||
return QueueBuilder.durable(LLM_NER_SERVICE_RESPONSE_QUEUE)
|
||||
.withArgument("x-dead-letter-exchange", "")
|
||||
.withArgument("x-dead-letter-routing-key", LLM_NER_SERVICE_DLQ)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue llmNerResponseDLQ() {
|
||||
|
||||
return QueueBuilder.durable(LLM_NER_SERVICE_DLQ).build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,27 @@
|
||||
package com.knecon.fforesight.llm.service.websocket;
|
||||
|
||||
import org.springframework.messaging.simp.SimpMessagingTemplate;
|
||||
import org.springframework.security.core.parameters.P;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.llm.service.services.WebSocketMessagingTemplate;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class WebSocketMessagingService implements WebSocketMessagingTemplate {
|
||||
|
||||
SimpMessagingTemplate messagingTemplate;
|
||||
|
||||
|
||||
@Override
|
||||
public void sendEvent(String userId, String token, Object payload) {
|
||||
|
||||
messagingTemplate.convertAndSendToUser(userId, token, payload);
|
||||
}
|
||||
|
||||
}
|
||||
@ -21,8 +21,8 @@ spring:
|
||||
prefetch: 1
|
||||
|
||||
llm-service:
|
||||
azureOpenAiKey: "Your Azure open Api Key"
|
||||
azureOpenAiEndpoint: "Your Azure open Api Endpoint"
|
||||
azureOpenAiKey: "679b023858314dfe807e50a2e7d86d63"
|
||||
azureOpenAiEndpoint: "https://knecon-ca-demo.openai.azure.com/"
|
||||
|
||||
fforesight:
|
||||
llm-service:
|
||||
@ -9,6 +9,7 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.autoconfigure.actuate.observability.AutoConfigureObservability;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
@ -26,6 +27,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
import com.knecon.fforesight.keycloakcommons.DefaultKeyCloakCommonsAutoConfiguration;
|
||||
import com.knecon.fforesight.swaggercommons.SpringDocAutoConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantResponse;
|
||||
@ -64,6 +68,7 @@ public abstract class AbstractLlmServiceIntegrationTest {
|
||||
@SuppressWarnings("PMD.TestClassWithoutTestCases")
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
@ImportAutoConfiguration({StorageAutoConfiguration.class, MultiTenancyAutoConfiguration.class, SpringDocAutoConfiguration.class, DefaultKeyCloakCommonsAutoConfiguration.class})
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
public static class TestConfiguration {
|
||||
|
||||
@ -0,0 +1,81 @@
|
||||
package com.knecon.fforesight.llm.service;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.knecon.fforesight.llm.service.services.LlmNerService;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {
|
||||
|
||||
public static final String DOCUMENT_TEXT = "DOCUMENT_TEXT";
|
||||
public static final String DOCUMENT_POSITIONS = "DOCUMENT_POSITION";
|
||||
public static final String DOCUMENT_STRUCTURE = "DOCUMENT_STRUCTURE";
|
||||
public static final String DOCUMENT_PAGES = "DOCUMENT_PAGES";
|
||||
public static final String DOCUMENT_CHUNKS = "DOCUMENT_CHUNKS";
|
||||
@Autowired
|
||||
LlmNerService llmNerService;
|
||||
|
||||
Set<String> relevantFiles = Set.of(DOCUMENT_TEXT, DOCUMENT_POSITIONS, DOCUMENT_STRUCTURE, DOCUMENT_PAGES, DOCUMENT_CHUNKS);
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLlmNer() {
|
||||
|
||||
Path folder = Path.of("/home/kschuettler/Downloads/New Folder (2)/2f4cc06f-d941-4f87-8928-b5d8a9476387/75ecec8c698f561c91d1a3e9f81dad7c");
|
||||
LlmNerMessage message = prepStorage(folder);
|
||||
llmNerService.runNer(message);
|
||||
Path tmpFile = Path.of("tmp", "AAA_LLM_ENTITIES", "entities.json");
|
||||
Files.createDirectories(tmpFile.getParent());
|
||||
storageService.downloadTo(TEST_TENANT, message.getResultStorageId(), tmpFile.toFile());
|
||||
}
|
||||
|
||||
|
||||
private LlmNerMessage prepStorage(Path folder) throws IOException {
|
||||
|
||||
LlmNerMessage message = buildMessage(folder);
|
||||
Files.walk(folder)
|
||||
.filter(path -> path.toFile().isFile())
|
||||
.filter(path -> relevantFiles.stream()
|
||||
.anyMatch(filePath -> path.getFileName().toString().contains(filePath)))
|
||||
.forEach(relevantFile -> storeFile(relevantFile, folder));
|
||||
return message;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void storeFile(Path relevantFile, Path folder) {
|
||||
|
||||
try (var in = new FileInputStream(relevantFile.toFile())) {
|
||||
storageService.storeObject(TenantContext.getTenantId(),
|
||||
folder + relevantFiles.stream()
|
||||
.filter(filePath -> relevantFile.getFileName().toString().contains(filePath))
|
||||
.findFirst()
|
||||
.orElseThrow(),
|
||||
in);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static LlmNerMessage buildMessage(Path folder) {
|
||||
|
||||
return LlmNerMessage.builder()
|
||||
.chunksStorageId(folder + DOCUMENT_CHUNKS)
|
||||
.documentPagesStorageId(folder + DOCUMENT_PAGES)
|
||||
.documentTextStorageId(folder + DOCUMENT_TEXT)
|
||||
.documentPositionStorageId(folder + DOCUMENT_POSITIONS)
|
||||
.documentStructureStorageId(folder + DOCUMENT_STRUCTURE)
|
||||
.resultStorageId(folder + "result")
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,7 +2,7 @@ server:
|
||||
port: 28080
|
||||
fforesight:
|
||||
keycloak:
|
||||
enabled: false
|
||||
enabled: true
|
||||
springdoc:
|
||||
enabled: false
|
||||
|
||||
@ -13,5 +13,5 @@ keyword-service.url: "http://mock.url"
|
||||
|
||||
|
||||
llm-service:
|
||||
azureOpenAiKey: "Your Azure open Api Key"
|
||||
azureOpenAiEndpoint: "Your Azure open Api Endpoint"
|
||||
azureOpenAiKey: "679b023858314dfe807e50a2e7d86d63"
|
||||
azureOpenAiEndpoint: "https://knecon-ca-demo.openai.azure.com/"
|
||||
@ -1,8 +1,45 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
dir=${PWD##*/}
|
||||
|
||||
gradle assemble
|
||||
|
||||
buildNumber=${1:-1}
|
||||
# Get the current Git branch
|
||||
branch=$(git rev-parse --abbrev-ref HEAD)
|
||||
|
||||
gradle bootBuildImage --cleanCache --publishImage -Pversion=$USER-$buildNumber
|
||||
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber"
|
||||
# Get the short commit hash (first 5 characters)
|
||||
commit_hash=$(git rev-parse --short=5 HEAD)
|
||||
|
||||
# Combine branch and commit hash
|
||||
buildName="${USER}-${branch}-${commit_hash}"
|
||||
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
|
||||
|
||||
newImageName="nexus.knecon.com:5001/ff/llm-service-server:${buildName}"
|
||||
|
||||
echo "full image name:"
|
||||
echo ${newImageName}
|
||||
echo ""
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
namespace=${1}
|
||||
deployment_name="llm-service"
|
||||
|
||||
echo "deploying to ${namespace}"
|
||||
|
||||
oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
|
||||
|
||||
if [ "${newImageName}" = "${oldImageName}" ]; then
|
||||
echo "Image tag did not change, redeploying..."
|
||||
rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
|
||||
else
|
||||
echo "upgrading the image tag..."
|
||||
rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
|
||||
fi
|
||||
rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
|
||||
echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"
|
||||
|
||||
@ -1 +1,7 @@
|
||||
rootProject.name = "llm-service"
|
||||
rootProject.name = "llm-service"
|
||||
include(":llm-service-api")
|
||||
include(":llm-service-server")
|
||||
include(":llm-service-processor")
|
||||
project(":llm-service-api").projectDir = file("llm-service/llm-service-api")
|
||||
project(":llm-service-server").projectDir = file("llm-service/llm-service-server")
|
||||
project(":llm-service-processor").projectDir = file("llm-service/llm-service-processor")
|
||||
@ -1,15 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.api;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class ErrorMessage {
|
||||
|
||||
private final String message;
|
||||
private OffsetDateTime timestamp = OffsetDateTime.now();
|
||||
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class ChatEvent {
|
||||
|
||||
String token;
|
||||
|
||||
}
|
||||
@ -1,19 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.api.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class PromptList {
|
||||
|
||||
private List<String> prompts = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,325 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.model;
|
||||
|
||||
public class SystemMessages {
|
||||
|
||||
public static String RULES_CO_PILOT = """
|
||||
From now on, you are a Drools rule generator. This means you will start your answer with a step-by-step explanation how to write a rule, which will fulfill the prompt, followed by the rule.
|
||||
|
||||
You have a document structure written in Java with the following objects:
|
||||
|
||||
- Section
|
||||
- Table
|
||||
- TableCell
|
||||
- Paragraph
|
||||
- Headline
|
||||
- Page
|
||||
- TextEntity
|
||||
- EntityCreationService
|
||||
|
||||
The Section, Table, TableCell, Paragraph, and Headline implement a common interface called SemanticNode. SemanticNodes are arranged in a tree-like fashion, where any SemanticNode can have multiple SemanticNodes as children. The arrangement is as follows:
|
||||
- Tables only have TableCells as children.
|
||||
- TableCells may have any child, except TableCells.
|
||||
- Paragraphs and Headlines have no children.
|
||||
- Sections may have any child except TableCells, but if it contains Paragraphs as well as Tables, it is split into a Section with multiple Sections as children, where any child Section only contains either Tables or Paragraphs.
|
||||
Further, if the first SemanticNode is a Headline it remains the first child in the Parent Section, before any subsections. You can assume there are no null values.
|
||||
It is also important to assume, that the document structure might be faulty in its design and to write the rule as robust as possible.
|
||||
|
||||
The goal of the rules is to identify pieces of text that should be extracted. In order to represent the pieces of text we create a TextEntity using the entityCreationService.
|
||||
For example, we want to extract all Authors and addresses of a document. Or all personally identifiable information, such as E-Mails and Telephone Numbers.
|
||||
TextEntities may also represent other pieces of text, such as published information, or certain species of vertebrates.
|
||||
The TextEntities are part of the document structure, such that they are referenced in each SemanticNode and Page which contains it. Further, the TextEntity references each Page and SemanticNode whose text it intersects.
|
||||
|
||||
A rule is written by first enforcing conditions on a SemanticNode in the when-block and then a call to the EntityCreationService is performed in the then-block.
|
||||
The created entity then needs to be applied or removed, depending on the use case. Apply means it will be extracted, removed means it will not.
|
||||
It is important to only call methods that change the state inside the then-block and NEVER call them in the when block. Not even with the "from" keyword.
|
||||
This includes all entityCreationService methods or apply/remove on a TextEntity,
|
||||
Finally, the rule has to be formatted as such: the name starts with an Identifier following the pattern /w+./d+./d+, then a ":" and finally a descriptive name.
|
||||
The identifier must match with the identifier used in the apply/remove calls on the TextEntity.
|
||||
|
||||
Examples:
|
||||
A rule which finds all text until the end of a line, when the line starts with "Authors:" as the type "CBI_author".
|
||||
This rule searches for any section containing the String "Authors:" and then creates a TextEntity after that string in that section.
|
||||
rule "CBI.17.0: Add CBI_author for lines after AUTHORS:"
|
||||
when
|
||||
$section: Section(containsString("Authors:"))
|
||||
then
|
||||
entityCreationService.lineAfterString("Authors:", "CBI_author", EntityType.ENTITY, $section)
|
||||
.forEach(entity -> entity.apply("CBI.17.0", "Line after \\"Authors:\\""));
|
||||
end
|
||||
|
||||
A rule, which finds contact information as type "PII" by looking at the text after certain keywords:
|
||||
This rule iterates through all relevant keywords and sections, for each match the lineAfterString method is called, creating a new TextEntity.
|
||||
rule "PII.4.0: Extract line after contact information keywords"
|
||||
when
|
||||
$contactKeyword: String() from List.of("Contact point:",
|
||||
"Contact:",
|
||||
"Alternative contact:",
|
||||
"European contact:",
|
||||
"No:",
|
||||
"Contact:",
|
||||
"Tel.:",
|
||||
"Tel:",
|
||||
"Telephone number:",
|
||||
"Telephone No:",
|
||||
"Telephone:",
|
||||
"Phone No.",
|
||||
"Phone:",
|
||||
"Fax number:",
|
||||
"Fax:",
|
||||
"E-mail:",
|
||||
"Email:",
|
||||
"e-mail:",
|
||||
"E-mail address:")
|
||||
$section: Section(containsString($contactKeyword))
|
||||
then
|
||||
entityCreationService.lineAfterString($contactKeyword, "PII", EntityType.ENTITY, $section)
|
||||
.forEach(contactEntity -> contactEntity.apply("PII.4.0", "Found after \\"" + $contactKeyword + "\\" contact keyword");
|
||||
end
|
||||
|
||||
A rule which finds all Emails as type "PII" using a regex:
|
||||
This rule first searches for a section containing the character "@", since the execution of the regex might be costly to run on all sections.
|
||||
rule "PII.1.0: Extract Emails by RegEx"
|
||||
when
|
||||
$section: Section(containsString("@"))
|
||||
then
|
||||
entityCreationService.byRegex("\\\\b([A-Za-z0-9._%+\\\\-]+@[A-Za-z0-9.\\\\-]+\\\\.[A-Za-z\\\\-]{1,23}[A-Za-z])\\\\b", "PII", EntityType.ENTITY, 1, $section)
|
||||
.forEach(emailEntity -> emailEntity.apply("PII.1.0", "Found by Email Regex");
|
||||
end
|
||||
|
||||
A rule which extracts all paragraphs of a section with the Headline "Mortality"
|
||||
This rule first identifies a Headline containing the word "Mortality" and then extracts all paragraphs as TextEntities using the entityCreationService.
|
||||
rule "DOC.14.0: Mortality"
|
||||
when
|
||||
$headline: Headline(containsString("Mortality"))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.14.0", "Mortality found", "n-a"));
|
||||
end
|
||||
|
||||
A rule which extracts all paragraphs of a Section and its Subsections with the Headline "Study Design"
|
||||
This rule uses the SectionIdentifier to find all headlines which are children of the headline "Study Design". From these headlines it finds the sections using the getParent() method.
|
||||
This is done, since the document structure might be faulty, so using the SectionIdentifier is more robust. The SectionIdentifier describes the numbers in front of a Headline,
|
||||
e.g. "3.0 Study Design" -> "3.0"
|
||||
rule "DOC.20.1: Study Design"
|
||||
when
|
||||
Headline(containsStringIgnoreCase("Study Design"), $sectionIdentifier: getSectionIdentifier())
|
||||
$headline: Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "study_design", EntityType.ENTITY)
|
||||
.forEach(entity -> {
|
||||
entity.apply("DOC.20.1", "Study design section found", "n-a");
|
||||
});
|
||||
end
|
||||
|
||||
A rule which extracts each TableCell with the header 'Author' or 'Author(s)'
|
||||
This rule first identifies a header TableCell containing the word 'Author' or 'Author(s)' and then finds any TableCell underneath it and creates a TextEntity from it.
|
||||
rule "CBI.11.0: Extract and recommend TableCell with header 'Author' or 'Author(s)'"
|
||||
when
|
||||
$table: Table(hasHeader("Author(s)") || hasHeader("Author"))
|
||||
TableCell(isHeader(), containsAnyStringIgnoreCase("Author", "Author(s)"), $authorCol: col) from $table.streamHeaders().toList()
|
||||
$authorCell: TableCell() from $table.streamCol($authorCol).toList()
|
||||
then
|
||||
entityCreationService.bySemanticNode($authorCell, "CBI_author", EntityType.ENTITY)
|
||||
.ifPresent(authorEntity -> authorEntity.apply("CBI.11.0", "Author header found");
|
||||
end
|
||||
|
||||
A rule which extracts all authors in each row that represents a vertebrate study
|
||||
This rule uses the when block to first identify TableCells that are headers containing a String "Author" or "Author(s)" and a header cell "Vertebrate Study Y/N".
|
||||
It then identifies cells underneath the second header containing a String like "Y" or "Yes" and finally a cell in the same row as the previous cell and same column as the Author header cell.
|
||||
Each match is then turned into an entity using the entityCreationService in the then-block.
|
||||
rule "CBI.12.0: Extract and recommend TableCell with header 'Author' or 'Author(s)' and header 'Vertebrate study Y/N' with value 'Yes'"
|
||||
when
|
||||
$table: Table(hasHeader("Author(s)") || hasHeader("Author"), hasHeaderIgnoreCase("Vertebrate Study Y/N"))
|
||||
TableCell(isHeader(), containsAnyStringIgnoreCase("Author", "Author(s)"), $authorCol: col) from $table.streamHeaders().toList()
|
||||
TableCell(isHeader(), containsStringIgnoreCase("Vertebrate study Y/N"), $vertebrateCol: col) from $table.streamHeaders().toList()
|
||||
TableCell(!isHeader(), containsAnyString("Yes", "Y"), $rowWithYes: row) from $table.streamCol($vertebrateCol).toList()
|
||||
$authorCell: TableCell(row == $rowWithYes) from $table.streamCol($authorCol).toList()
|
||||
then
|
||||
entityCreationService.bySemanticNode($authorCell, "CBI_author", EntityType.ENTITY)
|
||||
.ifPresent(authorEntity -> authorEntity.apply("CBI.12.0", "Extracted because it's row belongs to a vertebrate study");
|
||||
end
|
||||
|
||||
|
||||
Below are all functions you can use, listed by their respective class. Make sure to only use functions that are listed here or are included in java 17
|
||||
EntityCreationService:
|
||||
betweenTextRanges(List<TextRange> startBoundaries, List<TextRange> stopBoundaries, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenTextRanges(List<TextRange> startBoundaries, List<TextRange> stopBoundaries, String type, EntityType entityType, SemanticNode node, int limit) -> Stream<TextEntity>
|
||||
byTextRange(TextRange textRange, String type, EntityType entityType, SemanticNode node) -> Optional<TextEntity>
|
||||
byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegex(String regexPattern, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) -> Stream<TextEntity>
|
||||
byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
bySemanticNode(SemanticNode node, String type, EntityType entityType) -> Optional<TextEntity>
|
||||
betweenStrings(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
shortestBetweenAnyString(List<String> starts, List<String> stops, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
shortestBetweenAnyStringIgnoreCase(List<String> starts, List<String> stops, String type, EntityType entityType, SemanticNode node, int limit) -> Stream<TextEntity>
|
||||
shortestBetweenAnyStringIgnoreCase(List<String> starts, List<String> stops, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
betweenRegexesIgnoreCase(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
lineAfterStrings(List<String> strings, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
lineAfterStringsIgnoreCase(List<String> strings, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
lineAfterString(String string, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
lineAfterStringIgnoreCase(String string, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
lineAfterStringAcrossColumns(String string, String type, EntityType entityType, Table tableNode) -> Stream<TextEntity>
|
||||
lineAfterStringAcrossColumnsIgnoreCase(String string, String type, EntityType entityType, Table tableNode) -> Stream<TextEntity>
|
||||
semanticNodeAfterString(String string, String type, EntityType entityType, SemanticNode node) -> Optional<TextEntity>
|
||||
byString(String keyword, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
byStringIgnoreCase(String keyword, String type, EntityType entityType, SemanticNode node) -> Stream<TextEntity>
|
||||
bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType) -> Stream<TextEntity>
|
||||
bySemanticNodeParagraphsOnlyMergeConsecutive(SemanticNode node, String type, EntityType entityType) -> Stream<TextEntity>
|
||||
byPrefixExpansionRegex(TextEntity entity, String regexPattern) -> Optional<TextEntity>
|
||||
bySuffixExpansionRegex(TextEntity entity, String regexPattern) -> Optional<TextEntity>
|
||||
|
||||
SemanticNode:
|
||||
length() -> int
|
||||
getParent() -> SemanticNode
|
||||
getType() -> NodeType
|
||||
isLeaf() -> boolean
|
||||
getTextBlock() -> TextBlock
|
||||
getPages() -> Set<Page>
|
||||
getPages(TextRange textRange) -> Set<Page>
|
||||
getTextRange() -> TextRange
|
||||
getHeadline() -> Headline
|
||||
getSectionIdentifier() -> SectionIdentifier
|
||||
getNextSibling() -> Optional<SemanticNode>
|
||||
getPreviousSibling() -> Optional<SemanticNode>
|
||||
getEntities() -> Set<TextEntity>
|
||||
getBBox() -> Map<Page, Rectangle2D>
|
||||
streamChildren() -> Stream<SemanticNode>
|
||||
getFirstPage() -> Page
|
||||
onPage(int pageNumber) -> boolean
|
||||
hasParent() -> boolean
|
||||
getHighestParent() -> SemanticNode
|
||||
hasEntitiesOfType(String type) -> boolean
|
||||
hasEntitiesOfAnyType(String... types) -> boolean
|
||||
hasEntitiesOfAllTypes(String... types) -> boolean
|
||||
getEntitiesOfType(List<String> types) -> List<TextEntity>
|
||||
getEntitiesOfType(String type) -> List<TextEntity>
|
||||
getEntitiesOfType(String... types) -> List<TextEntity>
|
||||
hasText() -> boolean
|
||||
containsString(String string) -> boolean
|
||||
containsAllStrings(String... strings) -> boolean
|
||||
containsAnyString(String... strings) -> boolean
|
||||
containsAnyString(List<String> strings) -> boolean
|
||||
containsStringIgnoreCase(String string) -> boolean
|
||||
containsAnyStringIgnoreCase(String... strings) -> boolean
|
||||
containsAllStringsIgnoreCase(String... strings) -> boolean
|
||||
containsWord(String word) -> boolean
|
||||
containsWordIgnoreCase(String word) -> boolean
|
||||
containsAnyWord(String... words) -> boolean
|
||||
containsAnyWordIgnoreCase(String... words) -> boolean
|
||||
containsAllWords(String... words) -> boolean
|
||||
containsAllWordsIgnoreCase(String... words) -> boolean
|
||||
matchesRegex(String regexPattern) -> boolean
|
||||
matchesRegexIgnoreCase(String regexPattern) -> boolean
|
||||
streamChildrenOfType(NodeType nodeType) -> Stream<SemanticNode>
|
||||
streamAllSubNodes() -> Stream<SemanticNode>
|
||||
streamAllSubNodesOfType(NodeType nodeType) -> Stream<SemanticNode>
|
||||
containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) -> boolean
|
||||
intersectsRectangle(int x, int y, int w, int h, int pageNumber) -> boolean
|
||||
getSectionIdentifier() -> SectionIdentifier
|
||||
|
||||
Section:
|
||||
hasTables() -> boolean
|
||||
anyHeadlineContainsString(String value) -> boolean
|
||||
anyHeadlineContainsStringIgnoreCase(String value) -> boolean
|
||||
|
||||
SectionIdentifier:
|
||||
isParentOf(SectionIdentifier sectionIdentifier) -> boolean
|
||||
isChildOf(SectionIdentifier sectionIdentifier) -> boolean
|
||||
|
||||
Table:
|
||||
rowContainsStringsIgnoreCase(Integer row, List<String> strings) -> boolean
|
||||
streamRow(int row) -> Stream<TableCell>
|
||||
streamHeaders() -> Stream<TableCell>
|
||||
streamTableCells() -> Stream<TableCell>
|
||||
streamCol(int col) -> Stream<TableCell>
|
||||
streamTableCellsWithHeader(String header) -> Stream<TableCell>
|
||||
getCell(int row, int col) -> TableCell
|
||||
streamTableCellsWhichContainType(String type) -> Stream<TableCell>
|
||||
streamHeadersForCell(int row, int col) -> Stream<TableCell>
|
||||
hasHeader(String header) -> boolean
|
||||
hasHeaderIgnoreCase(String header) -> boolean
|
||||
hasRowWithHeaderAndValue(String header, String value) -> boolean
|
||||
hasRowWithHeaderAndAnyValue(String header, List<String> values) -> boolean
|
||||
getEntitiesOfTypeInSameRow(String type, TextEntity textEntity) -> List<TextEntity>
|
||||
getNumberOfRows() -> int
|
||||
getNumberOfCols() -> int
|
||||
|
||||
TableCell:
|
||||
getRow() -> int
|
||||
getCol() -> int
|
||||
isHeader() -> boolean
|
||||
|
||||
TextEntity:
|
||||
type() -> String
|
||||
value() -> String
|
||||
getEntityType() -> EntityType
|
||||
removed() -> boolean
|
||||
skipped() -> boolean
|
||||
active() -> boolean
|
||||
applied() -> boolean
|
||||
ignored() -> boolean
|
||||
getId() -> String
|
||||
legalBasis() -> String
|
||||
references() -> Set<TextEntity>
|
||||
length() -> int
|
||||
isType(String type) -> boolean
|
||||
getMatchedRule() -> MatchedRule
|
||||
getDeepestFullyContainingNode() -> SemanticNode
|
||||
getTextRange() -> TextRange
|
||||
contains(TextEntity textEntity) -> boolean
|
||||
containedBy(TextEntity textEntity) -> boolean
|
||||
intersects(TextEntity textEntity) -> boolean
|
||||
occursInNodeOfType(Class<? extends SemanticNode> clazz) -> boolean
|
||||
occursInNode(SemanticNode semanticNode) -> boolean
|
||||
isAnyType(List<String> types) -> boolean
|
||||
isDictionaryEntry() -> boolean
|
||||
isDossierDictionaryEntry() -> boolean
|
||||
getEngines() -> Set<Engine>
|
||||
getTextBefore() -> String
|
||||
getTextAfter() -> String
|
||||
getPages() -> Set<Page>
|
||||
getIntersectingNodes() -> List<SemanticNode>
|
||||
apply(String ruleIdentifier, String reason) -> void
|
||||
apply(String ruleIdentifier, String reason, String legalBasis) -> void
|
||||
redact(String ruleIdentifier, String reason, String legalBasis) -> void
|
||||
skip(String ruleIdentifier, String reason) -> void
|
||||
ignore(String ruleIdentifier, String reason) -> void
|
||||
remove(String ruleIdentifier, String reason) -> void
|
||||
applyWithLineBreaks(String ruleIdentifier, String reason, String legalBasis) -> void
|
||||
applyWithReferences(String ruleIdentifier, String reason, String legalBasis, Collection<TextEntity> references) -> void
|
||||
skipWithReferences(String ruleIdentifier, String reason, Collection<TextEntity> references) -> void
|
||||
|
||||
EntityType:
|
||||
ENTITY,
|
||||
HINT,
|
||||
RECOMMENDATION,
|
||||
FALSE_POSITIVE,
|
||||
FALSE_RECOMMENDATION
|
||||
|
||||
TextRange:
|
||||
length() -> int
|
||||
end() -> int
|
||||
start() -> int
|
||||
contains(int start, int end) -> boolean
|
||||
contains(int index) -> boolean
|
||||
contains(TextRange textRange) -> boolean
|
||||
merge(Collection<TextRange> textRanges) -> TextRange
|
||||
containedBy(int start, int end) -> boolean
|
||||
containedBy(TextRange textRange) -> boolean
|
||||
intersects(TextRange textRange) -> boolean
|
||||
""";
|
||||
|
||||
}
|
||||
@ -1,21 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.queue;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class MessageHandler {
|
||||
|
||||
// @SneakyThrows
|
||||
// @RabbitHandler
|
||||
// @RabbitListener(queues = "#{llmServiceSettings.getRequestQueueName()}")
|
||||
// public void receiveIndexingRequest(Message message) {
|
||||
//
|
||||
// // TODO: Do something.
|
||||
// }
|
||||
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.queue;
|
||||
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Configuration
|
||||
@RequiredArgsConstructor
|
||||
public class MessagingConfiguration {
|
||||
|
||||
}
|
||||
@ -1,69 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.messaging.simp.SimpMessagingTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.ai.openai.OpenAIAsyncClient;
|
||||
import com.azure.ai.openai.OpenAIClientBuilder;
|
||||
import com.azure.ai.openai.models.ChatCompletions;
|
||||
import com.azure.ai.openai.models.ChatCompletionsOptions;
|
||||
import com.azure.ai.openai.models.ChatMessage;
|
||||
import com.azure.ai.openai.models.ChatRole;
|
||||
import com.azure.core.credential.AzureKeyCredential;
|
||||
import com.knecon.fforesight.llm.service.api.model.ChatEvent;
|
||||
import com.knecon.fforesight.llm.service.model.SystemMessages;
|
||||
import com.knecon.fforesight.llm.service.settings.LlmServiceSettings;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import reactor.core.publisher.Flux;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class LlmService {
|
||||
|
||||
private final SimpMessagingTemplate websocketTemplate;
|
||||
private final LlmServiceSettings settings;
|
||||
private OpenAIAsyncClient client;
|
||||
|
||||
|
||||
@PostConstruct
|
||||
public void init() {
|
||||
|
||||
client = new OpenAIClientBuilder().credential(new AzureKeyCredential(settings.getAzureOpenAiKey())).endpoint(settings.getAzureOpenAiEndpoint()).buildAsyncClient();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void rulesCopilot(List<String> prompt, String userId) {
|
||||
|
||||
List<ChatMessage> chatMessages = new ArrayList<>();
|
||||
chatMessages.add(new ChatMessage(ChatRole.SYSTEM, SystemMessages.RULES_CO_PILOT));
|
||||
chatMessages.addAll(prompt.stream()
|
||||
.map(p -> new ChatMessage(ChatRole.USER, p))
|
||||
.toList());
|
||||
ChatCompletionsOptions options = new ChatCompletionsOptions(chatMessages);
|
||||
options.setStream(true);
|
||||
Flux<ChatCompletions> chatCompletions = client.getChatCompletionsStream(settings.getModel(), options);
|
||||
chatCompletions.subscribe(chatCompletion -> {
|
||||
sendWebsocketEvent(userId,
|
||||
chatCompletion.getChoices()
|
||||
.get(0).getDelta().getContent());
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private void sendWebsocketEvent(String userId, String token) {
|
||||
|
||||
websocketTemplate.convertAndSendToUser(userId, "/queue/" + TenantContext.getTenantId() + "/rules-copilot", new ChatEvent(token));
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,25 +0,0 @@
|
||||
package com.knecon.fforesight.llm.service.settings;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Primary
|
||||
@Configuration
|
||||
@ConfigurationProperties("llm-service")
|
||||
public class LlmServiceSettings {
|
||||
|
||||
private String requestQueueName = "llm_request_queue";
|
||||
private String responseQueueName = "llm_response_queue";
|
||||
private String deadLetterQueueName = "llm_dead_letter_queue";
|
||||
|
||||
|
||||
private String azureOpenAiKey;
|
||||
private String azureOpenAiEndpoint;
|
||||
private String model = "gpt-4-cqs-dev";
|
||||
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user