RED-10072: AI description field and toggle for entities

* initial draft
This commit is contained in:
maverickstuder 2024-10-14 12:29:40 +02:00
parent 022dc2197a
commit 1c28851ecc
5 changed files with 64 additions and 35 deletions

View File

@ -27,5 +27,6 @@ public class LlmNerMessage {
String documentPositionStorageId;
String documentPagesStorageId;
String resultStorageId;
long aiCreationVersion;
}

View File

@ -20,5 +20,6 @@ public class LlmNerResponseMessage {
int promptTokens;
int completionTokens;
int duration;
long aiCreationVersion;
}

View File

@ -357,6 +357,7 @@ public class SystemMessageProvider {
sb.append("\n**Instructions:**\n\n");
sb.append("1. **Entity Handling**:\n");
sb.append(" - Use the classes described above and only those for classification.\n");
sb.append(" - Include all relevant entities. Prefer inclusion over omission.\n");
sb.append(" - Avoid duplicates within each category.\n");
sb.append(" - Assign each entity to only one category, prioritizing specificity.");
@ -383,26 +384,28 @@ public class SystemMessageProvider {
sb.append("but except that, ensure that the entities in the JSON exactly match the text from the document, preserving the original formatting and casing.\n");
sb.append(" - Ensure there is no additional text or explanation outside the JSON structure.\n\n");
sb.append("**Example 1:**\n\n");
sb.append("_Entities Searched: PERSON, PII, ADDRESS, COMPANY_\n\n");
sb.append("**Input:**\n```\nContact Bob at bob@techcorp.com or visit TechCorp HQ at 456 Tech Avenue, New York, NY 10001 USA.\n```\n\n");
sb.append("**Output:**\n```json\n{\n");
sb.append(" \"PERSON\": [\"Bob\"],\n");
sb.append(" \"PII\": [\"bob@techcorp.com\"],\n");
sb.append(" \"ADDRESS\": [\"456 Tech Avenue, New York, NY 10001 USA\"],\n");
sb.append(" \"COMPANY\": [\"TechCorp\"],\n");
sb.append("}\n```\n\n");
// examples would possibly be beneficial but cause hallucinations
sb.append("**Example 2:**\n\n");
sb.append("_Entities Searched: EVENT, PRODUCT, DATE, LOCATION_\n\n");
sb.append("**Input:**\n```\nThe launch event for the new XYZ Smartphone is scheduled on September 30, 2024, at the Grand Convention Center in Berlin.");
sb.append("You can pre-order the device starting from August 15, 2024.\n```\n\n");
sb.append("**Output:**\n```json\n{\n");
sb.append(" \"EVENT\": [\"launch event\"],\n");
sb.append(" \"PRODUCT\": [\"XYZ Smartphone\"],\n");
sb.append(" \"DATE\": [\"September 30, 2024\", \"August 15, 2024\"],\n");
sb.append(" \"LOCATION\": [\"Grand Convention Center\", \"Berlin\"]\n");
sb.append("}\n```\n\n");
// sb.append("**Example 1:**\n\n");
// sb.append("_Entities Searched: PERSON, PII, ADDRESS, COMPANY_\n\n");
// sb.append("**Input:**\n```\nContact Bob at bob@techcorp.com or visit TechCorp HQ at 456 Tech Avenue, New York, NY 10001 USA.\n```\n\n");
// sb.append("**Output:**\n```json\n{\n");
// sb.append(" \"PERSON\": [\"Bob\"],\n");
// sb.append(" \"PII\": [\"bob@techcorp.com\"],\n");
// sb.append(" \"ADDRESS\": [\"456 Tech Avenue, New York, NY 10001 USA\"],\n");
// sb.append(" \"COMPANY\": [\"TechCorp\"],\n");
// sb.append("}\n```\n\n");
//
// sb.append("**Example 2:**\n\n");
// sb.append("_Entities Searched: EVENT, PRODUCT, DATE, LOCATION_\n\n");
// sb.append("**Input:**\n```\nThe launch event for the new XYZ Smartphone is scheduled on September 30, 2024, at the Grand Convention Center in Berlin.");
// sb.append("You can pre-order the device starting from August 15, 2024.\n```\n\n");
// sb.append("**Output:**\n```json\n{\n");
// sb.append(" \"EVENT\": [\"launch event\"],\n");
// sb.append(" \"PRODUCT\": [\"XYZ Smartphone\"],\n");
// sb.append(" \"DATE\": [\"September 30, 2024\", \"August 15, 2024\"],\n");
// sb.append(" \"LOCATION\": [\"Grand Convention Center\", \"Berlin\"]\n");
// sb.append("}\n```\n\n");
return sb.toString();
}

View File

@ -49,7 +49,8 @@ public class MessageHandler {
LlmNerResponseMessage llmNerResponseMessage = new LlmNerResponseMessage(llmNerMessage.getIdentifier(),
usage.promptTokenCount(),
usage.completionTokenCount(),
Math.toIntExact(usage.durationMillis()));
Math.toIntExact(usage.durationMillis()),
llmNerMessage.getAiCreationVersion());
log.info("LLM NER finished for {}", llmNerMessage.getIdentifier());
sendFinishedMessage(llmNerResponseMessage, message);
}

View File

@ -4,6 +4,8 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -16,14 +18,15 @@ import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.SneakyThrows;
@Disabled
//@Disabled
public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {
public static final String DOCUMENT_TEXT = "DOCUMENT_TEXT";
public static final String DOCUMENT_POSITIONS = "DOCUMENT_POSITION";
public static final String DOCUMENT_STRUCTURE = "DOCUMENT_STRUCTURE";
public static final String DOCUMENT_PAGES = "DOCUMENT_PAGES";
public static final String DOCUMENT_CHUNKS = "DOCUMENT_CHUNKS";
public static final String DOCUMENT_TEXT = "DOCUMENT_TEXT.proto";
public static final String DOCUMENT_POSITIONS = "DOCUMENT_POSITION.proto";
public static final String DOCUMENT_STRUCTURE = "DOCUMENT_STRUCTURE.proto";
public static final String DOCUMENT_PAGES = "DOCUMENT_PAGES.proto";
public static final String DOCUMENT_CHUNKS = "DOCUMENT_CHUNKS.json";
public static final String STORAGE_ID = "08904e84-4a5a-4c15-bc13-200237af6434/4d81e891fd3e94dfe0b6c51073ef55b6.";
@Autowired
LlmNerService llmNerService;
@ -34,10 +37,10 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {
@SneakyThrows
public void testLlmNer() {

    // Manual integration run: prepares storage from a local directory, runs NER on the
    // resulting message, then downloads the stored result for inspection.
    // NOTE(review): the input directory is an absolute path on one developer's machine;
    // this test cannot pass elsewhere without adjusting it.
    Path folder = Path.of("/Users/maverickstuder/Downloads/10-09-2024-16-03-47_files_list");

    // prepStorage is defined elsewhere in this class; presumably it uploads the files found
    // in the folder and returns the matching request message - confirm against its body.
    LlmNerMessage message = prepStorage(folder);
    llmNerService.runNer(message);

    // NOTE(review): "/private/tmp" is a platform-specific location; creating it may fail on
    // other operating systems.
    Path tmpFile = Path.of("/private/tmp", "LLM_ENTITIES", "entities.json");
    Files.createDirectories(tmpFile.getParent());
    storageService.downloadTo(TEST_TENANT, message.getResultStorageId(), tmpFile.toFile());
}
@ -60,7 +63,7 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {
try (var in = new FileInputStream(relevantFile.toFile())) {
storageService.storeObject(TenantContext.getTenantId(),
folder + relevantFiles.stream()
STORAGE_ID + relevantFiles.stream()
.filter(filePath -> relevantFile.getFileName().toString().contains(filePath))
.findFirst()
.orElseThrow(),
@ -71,14 +74,34 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest {
/**
 * Builds the NER request message used by the test.
 *
 * <p>All storage ids are derived from {@code STORAGE_ID} plus the file-name constants of this
 * class. Each builder property is set exactly once.
 *
 * @param folder local directory of the test document; only its last path element is used,
 *               as the value of the {@code "file"} identifier entry
 * @return the message carrying the identifier, the entity descriptions and the storage ids
 */
private static LlmNerMessage buildMessage(Path folder) {

    List<EntityAiDescription> entityAiDescriptions = new ArrayList<>();

    // Add descriptions for each entity type with examples
    entityAiDescriptions.add(new EntityAiDescription("PERSON",
            "A PERSON is any name referring to a human, excluding named methods (e.g., 'Klingbeil Test' is not a name). Each name should be its own entity, but first name, last name, and possibly middle name should be merged. Numbers are never part of a name. "
                    + "For example: 'Jennifer Durando, BS', 'Charlène Hernandez', 'Shaw A.', 'G J J Lubbe'."));

    entityAiDescriptions.add(new EntityAiDescription("PII",
            "PII refers to personally identifiable information such as email addresses, telephone numbers, fax numbers, or any other information that could uniquely identify an individual. "
                    + "For example: '01223 45678', 'mimi.lang@smithcorp.com', '+44 (0)1252 392460'."));

    entityAiDescriptions.add(new EntityAiDescription("ADDRESS",
            "An ADDRESS describes a real-life location. It should be as complete as possible and may include elements such as street address, city, state, postal code, and country. "
                    + "For example: 'Product Safety Labs 2394 US Highway 130 Dayton, NJ 08810 USA', 'Syngenta Crop Protection, LLC 410 Swing Road Post Office Box 18300 Greensboro, NC 27419-8300 USA'."));

    entityAiDescriptions.add(new EntityAiDescription("COMPANY",
            "A COMPANY is any corporate entity or approving body mentioned in the text, excluding companies mentioned as part of an address. "
                    + "For example: 'Syngenta', 'EFSA'."));

    entityAiDescriptions.add(new EntityAiDescription("COUNTRY",
            "A COUNTRY is any recognized nation mentioned in the text. Countries mentioned as part of an address should not be listed separately. "
                    + "For example: 'USA'."));

    // Storage ids use the fixed STORAGE_ID prefix, the same prefix the storage helper in this
    // class uses when storing the files, rather than the local folder path.
    return LlmNerMessage.builder()
            .identifier(Map.of("file", folder.getFileName().toString()))
            .entityAiDescriptions(entityAiDescriptions)
            .chunksStorageId(STORAGE_ID + DOCUMENT_CHUNKS)
            .documentPagesStorageId(STORAGE_ID + DOCUMENT_PAGES)
            .documentTextStorageId(STORAGE_ID + DOCUMENT_TEXT)
            .documentPositionStorageId(STORAGE_ID + DOCUMENT_POSITIONS)
            .documentStructureStorageId(STORAGE_ID + DOCUMENT_STRUCTURE)
            .resultStorageId(STORAGE_ID + "result")
            .build();
}