Spike: LLM NER

This commit is contained in:
Kilian Schüttler 2024-08-27 18:00:59 +02:00
parent 5ebe82b7ce
commit 4dc0a1fbdc
10 changed files with 127 additions and 53 deletions

View File

@ -28,6 +28,8 @@ tasks.named<Test>("test") {
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
minHeapSize = "512m"
maxHeapSize = "2048m"
}
tasks.test {

View File

@ -17,6 +17,7 @@ val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0"
val persistenceServiceVersion = "2.532.0"
val llmServiceVersion = "1.11.0"
val springBootStarterVersion = "3.1.5"
val springCloudVersion = "4.0.4"
val testContainersVersion = "1.19.7"
@ -39,7 +40,7 @@ dependencies {
exclude(group = "com.knecon.fforesight", module = "tenant-commons")
}
implementation("com.knecon.fforesight:layoutparser-service-internal-api:${layoutParserVersion}")
implementation("com.knecon.fforesight:llm-service-api:${llmServiceVersion}")
implementation("com.iqser.red.commons:spring-commons:6.2.0")
implementation("com.iqser.red.commons:metric-commons:2.3.0")
@ -48,7 +49,7 @@ dependencies {
implementation("com.knecon.fforesight:tenant-commons:0.28.0")
implementation("com.knecon.fforesight:keycloak-commons:0.30.0")
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("com.knecon.fforesight:lifecycle-commons:0.6.0")
implementation("com.knecon.fforesight:lifecycle-commons:0.7.0")
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")

View File

@ -24,6 +24,8 @@ public class RedactionServiceSettings {
private boolean azureNerServiceEnabled;
private boolean llmNerServiceEnabled;
private boolean priorityMode;
private long dictionaryCacheMaximumSize = 100;

View File

@ -10,7 +10,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
@ -65,16 +64,22 @@ public class NerEntities {
}
/**
* Represents a single NER entity with its value, text range, type, optional confidence, and the engine that produced it.
*/
public record NerEntity(String value, TextRange textRange, String type, Double confidence) {
public record NerEntity(String value, TextRange textRange, String type, Double confidence, Engine engine) {
public NerEntity(String value, TextRange textRange, String type) {
this(value, textRange, type, null);
this(value, textRange, type, null, Engine.NER);
}
}
/**
 * Identifies the origin of a {@link NerEntity}, i.e. which NER source produced it.
 */
public enum Engine {
// Assigned to entities converted from the regular NER service result; also the default
// used by the three-argument NerEntity constructor.
NER,
// Assigned to entities converted from the Azure NER result.
CLOUD_NER,
// Assigned to entities converted from the LLM NER result.
LLM_NER
}
}

View File

@ -33,6 +33,7 @@ import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.service.document.AreaGroupAnnotationService;
@ -332,19 +333,38 @@ public class AnalysisPreparationService {
NerEntities nerEntities;
if (redactionServiceSettings.isNerServiceEnabled()) {
nerEntities = NerEntitiesAdapter.toNerEntities(redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId()), document);
nerEntities = NerEntitiesAdapter.toNerEntities(redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId()),
document,
NerEntities.Engine.NER);
} else {
nerEntities = new NerEntities(Collections.emptyList());
}
if (redactionServiceSettings.isAzureNerServiceEnabled()) {
NerEntitiesModel azureNerEntitiesModel = redactionStorageService.getAzureNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
NerEntities azureNerEntities = NerEntitiesAdapter.toNerEntities(azureNerEntitiesModel, document);
NerEntities azureNerEntities = NerEntitiesAdapter.toNerEntities(azureNerEntitiesModel, document, NerEntities.Engine.CLOUD_NER);
nerEntities.merge(azureNerEntities);
}
if (redactionServiceSettings.isLlmNerServiceEnabled()) {
NerEntities llmNerEntities = getLlmNerEntities(analyzeRequest);
nerEntities.merge(llmNerEntities);
}
return nerEntities;
}
/**
 * Loads the stored LLM NER result for the file of the given request and converts each entry
 * into a {@link NerEntities.NerEntity} tagged with {@link NerEntities.Engine#LLM_NER}.
 * Offsets are taken over unchanged and no confidence is set.
 *
 * @param analyzeRequest supplies the dossier id and file id used to look up the stored result
 * @return the converted entities wrapped in a {@link NerEntities}
 */
private NerEntities getLlmNerEntities(AnalyzeRequest analyzeRequest) {
    var storedResult = redactionStorageService.getLlmNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
    var convertedEntities = storedResult.getEntities()
            .stream()
            .map(entity -> new NerEntities.NerEntity(entity.getValue(),
                                                     new TextRange(entity.getStartOffset(), entity.getEndOffset()),
                                                     entity.getType(),
                                                     null, // the LLM result is mapped without a confidence value
                                                     NerEntities.Engine.LLM_NER))
            .toList();
    return new NerEntities(convertedEntities);
}
private EntityLog getEntityLog(AnalyzeRequest analyzeRequest) {
EntityLog entityLogWithoutEntries = redactionStorageService.getEntityLogWithoutEntries(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
@ -392,16 +412,20 @@ public class AnalysisPreparationService {
if (redactionServiceSettings.isNerServiceEnabled()) {
NerEntitiesModel nerEntitiesModel = redactionStorageService.getNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
nerEntitiesModel = filterNerEntitiesModelBySectionIds(sectionsToReanalyseIds, nerEntitiesModel);
nerEntities = NerEntitiesAdapter.toNerEntities(nerEntitiesModel, document);
nerEntities = NerEntitiesAdapter.toNerEntities(nerEntitiesModel, document, NerEntities.Engine.NER);
} else {
nerEntities = new NerEntities(Collections.emptyList());
}
if (redactionServiceSettings.isAzureNerServiceEnabled()) {
NerEntitiesModel azureNerEntitiesModel = redactionStorageService.getAzureNerEntities(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
azureNerEntitiesModel = filterNerEntitiesModelBySectionIds(sectionsToReanalyseIds, azureNerEntitiesModel);
NerEntities azureNerEntities = NerEntitiesAdapter.toNerEntities(azureNerEntitiesModel, document);
NerEntities azureNerEntities = NerEntitiesAdapter.toNerEntities(azureNerEntitiesModel, document, NerEntities.Engine.CLOUD_NER);
nerEntities.merge(azureNerEntities);
}
if (redactionServiceSettings.isLlmNerServiceEnabled()) {
NerEntities llmNerEntities = getLlmNerEntities(analyzeRequest);
nerEntities.merge(llmNerEntities);
}
return nerEntities;
}

View File

@ -44,15 +44,29 @@ public class NerEntitiesAdapter {
*
* @param nerEntitiesModel the Entities just as the NER Service returns them
* @param document the document structure, from which the NER Service found the entities
* @param engine the origin of the NerEntities
* @return the validated entities wrapped in a {@link NerEntities}
*/
public NerEntities toNerEntities(NerEntitiesModel nerEntitiesModel, Document document, NerEntities.Engine engine) {
    // Start offsets are derived from the document first, then applied while flattening the model.
    var startOffsets = getStringStartOffsetsForMainSectionsHeadersFooters(document);
    var entities = addOffsetsAndFlatten(startOffsets, nerEntitiesModel)
            // Every resulting entity carries the engine passed in by the caller.
            .map(model -> new NerEntities.NerEntity(model.getValue(),
                                                    new TextRange(model.getStartOffset(), model.getEndOffset()),
                                                    model.getType(),
                                                    model.getConfidence(),
                                                    engine))
            .toList();
    return new NerEntities(entities);
}
public NerEntities toNerEntities(NerEntitiesModel nerEntitiesModel, Document document) {
return new NerEntities(addOffsetsAndFlatten(getStringStartOffsetsForMainSectionsHeadersFooters(document), nerEntitiesModel).map(nerEntityModel -> new NerEntities.NerEntity(
nerEntityModel.getValue(),
new TextRange(nerEntityModel.getStartOffset(), nerEntityModel.getEndOffset()),
nerEntityModel.getType(),
nerEntityModel.getConfidence()))
nerEntityModel.getConfidence(),
NerEntities.Engine.NER))
.toList());
}

View File

@ -31,6 +31,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
import com.iqser.red.storage.commons.exception.StorageObjectDoesNotExist;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.llm.service.LlmNerEntities;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
@ -306,6 +307,12 @@ public class RedactionStorageService {
}
/**
 * Reads the stored LLM NER result of a file for the current tenant and deserializes it
 * into {@link LlmNerEntities}.
 * NOTE(review): behavior when the storage object does not exist is whatever
 * {@code storageService.readJSONObject} does — confirm whether callers need a NotFoundException here.
 *
 * @param dossierId id of the dossier the file belongs to
 * @param fileId    id of the file
 * @return the deserialized LLM NER entities
 */
@Timed("redactmanager_getLlmNerEntities") // timed like the sibling getAzureNerEntities
public LlmNerEntities getLlmNerEntities(String dossierId, String fileId) {
    return storageService.readJSONObject(TenantContext.getTenantId(),
                                         StorageIdUtils.getStorageId(dossierId, fileId, FileType.LLM_NER_ENTITIES),
                                         LlmNerEntities.class);
}
@Timed("redactmanager_getAzureNerEntities")
public NerEntitiesModel getAzureNerEntities(String dossierId, String fileId) {

View File

@ -25,6 +25,7 @@ import org.bson.BsonArray;
import org.bson.BsonDocument;
import org.bson.BsonString;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.mockito.stubbing.Answer;
import org.springframework.amqp.rabbit.core.RabbitAdmin;
@ -218,6 +219,13 @@ public abstract class AbstractRedactionIntegrationTest {
protected TenantAuthenticationManagerResolver tenantAuthenticationManagerResolver;
/**
 * Initializes PDFNet with the demo key before any test of the class runs.
 */
@BeforeAll
public static void init() {
    final String demoKey = "demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a";
    PDFNet.initialize(demoKey);
}
@BeforeEach
public void setup() {
@ -237,8 +245,6 @@ public abstract class AbstractRedactionIntegrationTest {
when(tenantProvider.getTenant(any())).thenReturn(builder().tenantId("redaction").mongoDBConnection(mongoDBConnection).build());
tenantMongoLiquibaseExecutor.initializeTenant("redaction");
PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
}
@ -283,8 +289,10 @@ public abstract class AbstractRedactionIntegrationTest {
true));
when(dictionaryClient.getDictionaryForType(DOSSIER_AUTHOR_TYPE_ID, version)).then((Answer<Type>) invocation -> getDictionaryResponse(DICTIONARY_AUTHOR, true));
when(dictionaryClient.getAllTypesForDossierTemplate(TEST_DOSSIER_TEMPLATE_ID, version, true)).then((Answer<List<Type>>)invocation -> (getTemplateDictionaryTypeResponse()));
when(dictionaryClient.getAllTypesForDossier(TEST_DOSSIER_ID, version, true)).then((Answer<List<Type>>)invocation -> ((getDossierDictionaryTypeResponse())));
when(dictionaryClient.getAllTypesForDossierTemplate(TEST_DOSSIER_TEMPLATE_ID,
version,
true)).then((Answer<List<Type>>) invocation -> (getTemplateDictionaryTypeResponse()));
when(dictionaryClient.getAllTypesForDossier(TEST_DOSSIER_ID, version, true)).then((Answer<List<Type>>) invocation -> ((getDossierDictionaryTypeResponse())));
}
@ -559,7 +567,6 @@ public abstract class AbstractRedactionIntegrationTest {
}
protected List<Type> getDossierDictionaryTypeResponse() {
return dossierDictionary.keySet()

View File

@ -5,6 +5,8 @@ import static com.iqser.red.service.redaction.v1.server.testcontainers.MongoDBTe
import static com.iqser.red.service.redaction.v1.server.testcontainers.MongoDBTestContainer.MONGO_USERNAME;
import static com.knecon.fforesight.tenantcommons.model.TenantResponse.builder;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyBoolean;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.when;
import java.io.File;
@ -12,6 +14,7 @@ import java.io.FileInputStream;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
@ -36,6 +39,7 @@ import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.data.redis.listener.RedisMessageListenerContainer;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
@ -47,7 +51,9 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileTyp
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.persistence.service.v1.api.shared.model.group.GroupAnnotationInternalResponse;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.GroupRedactionClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
@ -56,6 +62,7 @@ import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryMode
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.service.websocket.RedisSyncedWebSocketService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.testcontainers.MongoDBTestContainer;
import com.iqser.red.service.redaction.v1.server.utils.exception.NotFoundException;
@ -83,52 +90,48 @@ import lombok.extern.slf4j.Slf4j;
* This way you can recreate what is happening on the stack almost exactly.
*/ public class AnalysisEnd2EndTest {
Path dossierTemplateToUse = Path.of(
"/Users/maverickstuder/Documents/syngenta/redactmanager/prod-cp-eu-reg/EFSA_sanitisation_pre_GFL_v1"); // Add your dossier-template here
Path dossierTemplateToUse = Path.of("/home/kschuettler/iqser/business-logic/documine/cpglobal/Flora SCM (Do Not Edit)"); // Add your dossier-template here
ObjectMapper mapper = ObjectMapperFactory.create();
final String TENANT_ID = "tenant";
TestDossierTemplate testDossierTemplate;
@Autowired
StorageService storageService;
@Autowired
protected AnalyzeService analyzeService;
@MockBean
DictionaryService dictionaryService;
@MockBean
RabbitTemplate rabbitTemplate;
TestDossierTemplate testDossierTemplate;
@MockBean
protected LegalBasisClient legalBasisClient;
@MockBean
protected RulesClient rulesClient;
@MockBean
protected DictionaryClient dictionaryClient;
@MockBean
private MongoConnectionProvider mongoConnectionProvider;
@MockBean
private TenantProvider tenantProvider;
@Autowired
protected MongoTestConfig mongoTestConfig;
@Autowired
protected TenantMongoLiquibaseExecutor tenantMongoLiquibaseExecutor;
@MockBean
protected TenantAuthenticationManagerResolver tenantAuthenticationManagerResolver;
@MockBean
protected GroupRedactionClient groupRedactionClient;
@MockBean
private RedisSyncedWebSocketService redisSyncedWebSocketService;
@MockBean
private RedisMessageListenerContainer redisPubsubContainer;
@MockBean
DictionaryService dictionaryService;
@MockBean
RabbitTemplate rabbitTemplate;
@MockBean
protected LegalBasisClient legalBasisClient;
@MockBean
protected RulesClient rulesClient;
@MockBean
protected DictionaryClient dictionaryClient;
@MockBean
private MongoConnectionProvider mongoConnectionProvider;
@MockBean
private TenantProvider tenantProvider;
@Test
@SneakyThrows
public void runAnalysisEnd2End() {
String folder = "/Users/maverickstuder/Documents/RedactManager/redaction-service/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/files_end2end/file0"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
String folder = "/home/kschuettler/Downloads/New Folder (4)/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327"; // Should contain all files from minio directly, still zipped. Can contain multiple files.
Path absoluteFolderPath;
if (folder.startsWith("files")) { // if it starts with "files" it is most likely in the resources folder, else it should be an absolute path
@ -175,7 +178,7 @@ import lombok.extern.slf4j.Slf4j;
when(mongoConnectionProvider.getMongoDBConnection(any())).thenReturn(mongoDBConnection);
when(tenantProvider.getTenant(any())).thenReturn(builder().tenantId(TENANT_ID).mongoDBConnection(mongoDBConnection).build());
tenantMongoLiquibaseExecutor.initializeTenant("redaction");
tenantMongoLiquibaseExecutor.initializeTenant(TENANT_ID);
testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse);
when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0));
@ -185,6 +188,10 @@ import lombok.extern.slf4j.Slf4j;
String type = invocation.getArgument(0);
return testDossierTemplate.testDictionary.isHint(type);
});
when(groupRedactionClient.getGroupAnnotations(anyString(), anyString(), anyBoolean())).thenReturn(GroupAnnotationInternalResponse.builder()
.textGroupAnnotations(new ArrayList<>())
.areaGroupAnnotations(new ArrayList<>())
.build());
when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> {
String type = invocation.getArgument(0);
return testDossierTemplate.testDictionary.getType(type).getColor();
@ -284,9 +291,14 @@ import lombok.extern.slf4j.Slf4j;
if (fileType.isEmpty()) {
return Optional.empty();
}
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
if (path.getFileName().endsWith(".gz")) {
try (var fis = new FileInputStream(path.toFile()); var in = new GZIPInputStream(fis);) {
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
}
} else {
try (var in = new FileInputStream(path.toFile())) {
storageService.storeObject(TENANT_ID, RedactionStorageService.StorageIdUtils.getStorageId(request.getDossierId(), request.getFileId(), fileType.get()), in);
}
}
return fileType;
}

View File

@ -133,9 +133,9 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
mockDictionaryCalls(null);
when(groupRedactionClient.getGroupAnnotations(anyString(), anyString(), anyBoolean())).thenReturn(GroupAnnotationInternalResponse.builder()
.textGroupAnnotations(Collections.emptyList())
.areaGroupAnnotations(Collections.emptyList())
.build());
.textGroupAnnotations(Collections.emptyList())
.areaGroupAnnotations(Collections.emptyList())
.build());
when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors);
}