diff --git a/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerMessage.java b/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerMessage.java index a679b7d..c49e604 100644 --- a/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerMessage.java +++ b/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerMessage.java @@ -27,5 +27,6 @@ public class LlmNerMessage { String documentPositionStorageId; String documentPagesStorageId; String resultStorageId; + long aiCreationVersion; } diff --git a/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerResponseMessage.java b/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerResponseMessage.java index 69ec7ca..239fef8 100644 --- a/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerResponseMessage.java +++ b/llm-service/llm-service-api/src/main/java/com/knecon/fforesight/llm/service/LlmNerResponseMessage.java @@ -20,5 +20,6 @@ public class LlmNerResponseMessage { int promptTokens; int completionTokens; int duration; + long aiCreationVersion; } diff --git a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/SystemMessageProvider.java b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/SystemMessageProvider.java index 0d7ad71..cd9435a 100644 --- a/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/SystemMessageProvider.java +++ b/llm-service/llm-service-processor/src/main/java/com/knecon/fforesight/llm/service/SystemMessageProvider.java @@ -357,6 +357,7 @@ public class SystemMessageProvider { sb.append("\n**Instructions:**\n\n"); sb.append("1. **Entity Handling**:\n"); + sb.append(" - Use the classes described above and only those for classification.\n"); sb.append(" - Include all relevant entities. Prefer inclusion over omission.\n"); sb.append(" - Avoid duplicates within each category.\n"); sb.append(" - Assign each entity to only one category, prioritizing specificity."); @@ -383,26 +384,28 @@ public class SystemMessageProvider { sb.append("but except that, ensure that the entities in the JSON exactly match the text from the document, preserving the original formatting and casing.\n"); sb.append(" - Ensure there is no additional text or explanation outside the JSON structure.\n\n"); - sb.append("**Example 1:**\n\n"); - sb.append("_Entities Searched: PERSON, PII, ADDRESS, COMPANY_\n\n"); - sb.append("**Input:**\n```\nContact Bob at bob@techcorp.com or visit TechCorp HQ at 456 Tech Avenue, New York, NY 10001 USA.\n```\n\n"); - sb.append("**Output:**\n```json\n{\n"); - sb.append(" \"PERSON\": [\"Bob\"],\n"); - sb.append(" \"PII\": [\"bob@techcorp.com\"],\n"); - sb.append(" \"ADDRESS\": [\"456 Tech Avenue, New York, NY 10001 USA\"],\n"); - sb.append(" \"COMPANY\": [\"TechCorp\"],\n"); - sb.append("}\n```\n\n"); + // examples would possibly be beneficial but cause hallucinations - sb.append("**Example 2:**\n\n"); - sb.append("_Entities Searched: EVENT, PRODUCT, DATE, LOCATION_\n\n"); - sb.append("**Input:**\n```\nThe launch event for the new XYZ Smartphone is scheduled on September 30, 2024, at the Grand Convention Center in Berlin."); - sb.append("You can pre-order the device starting from August 15, 2024.\n```\n\n"); - sb.append("**Output:**\n```json\n{\n"); - sb.append(" \"EVENT\": [\"launch event\"],\n"); - sb.append(" \"PRODUCT\": [\"XYZ Smartphone\"],\n"); - sb.append(" \"DATE\": [\"September 30, 2024\", \"August 15, 2024\"],\n"); - sb.append(" \"LOCATION\": [\"Grand Convention Center\", \"Berlin\"]\n"); - sb.append("}\n```\n\n"); +// sb.append("**Example 1:**\n\n"); +// sb.append("_Entities Searched: PERSON, PII, ADDRESS, COMPANY_\n\n"); +// sb.append("**Input:**\n```\nContact Bob at bob@techcorp.com or visit TechCorp HQ at 456 Tech Avenue, New York, NY 10001 USA.\n```\n\n"); +// sb.append("**Output:**\n```json\n{\n"); +// sb.append(" \"PERSON\": [\"Bob\"],\n"); +// sb.append(" \"PII\": [\"bob@techcorp.com\"],\n"); +// sb.append(" \"ADDRESS\": [\"456 Tech Avenue, New York, NY 10001 USA\"],\n"); +// sb.append(" \"COMPANY\": [\"TechCorp\"],\n"); +// sb.append("}\n```\n\n"); +// +// sb.append("**Example 2:**\n\n"); +// sb.append("_Entities Searched: EVENT, PRODUCT, DATE, LOCATION_\n\n"); +// sb.append("**Input:**\n```\nThe launch event for the new XYZ Smartphone is scheduled on September 30, 2024, at the Grand Convention Center in Berlin."); +// sb.append("You can pre-order the device starting from August 15, 2024.\n```\n\n"); +// sb.append("**Output:**\n```json\n{\n"); +// sb.append(" \"EVENT\": [\"launch event\"],\n"); +// sb.append(" \"PRODUCT\": [\"XYZ Smartphone\"],\n"); +// sb.append(" \"DATE\": [\"September 30, 2024\", \"August 15, 2024\"],\n"); +// sb.append(" \"LOCATION\": [\"Grand Convention Center\", \"Berlin\"]\n"); +// sb.append("}\n```\n\n"); return sb.toString(); } diff --git a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/MessageHandler.java b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/MessageHandler.java index ee40a56..10bd502 100644 --- a/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/MessageHandler.java +++ b/llm-service/llm-service-server/src/main/java/com/knecon/fforesight/llm/service/queue/MessageHandler.java @@ -49,7 +49,8 @@ public class MessageHandler { LlmNerResponseMessage llmNerResponseMessage = new LlmNerResponseMessage(llmNerMessage.getIdentifier(), usage.promptTokenCount(), usage.completionTokenCount(), - Math.toIntExact(usage.durationMillis())); + Math.toIntExact(usage.durationMillis()), + llmNerMessage.getAiCreationVersion()); log.info("LLM NER finished for {}", llmNerMessage.getIdentifier()); sendFinishedMessage(llmNerResponseMessage, message); } diff --git a/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java b/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java index 620171f..98eaa5e 100644 --- a/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java +++ b/llm-service/llm-service-server/src/test/java/com/knecon/fforesight/llm/service/LlmNerServiceTest.java @@ -4,6 +4,8 @@ import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; import java.util.Map; import java.util.Set; @@ -16,14 +18,15 @@ import com.knecon.fforesight.tenantcommons.TenantContext; import lombok.SneakyThrows; -@Disabled +//@Disabled public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest { - public static final String DOCUMENT_TEXT = "DOCUMENT_TEXT"; - public static final String DOCUMENT_POSITIONS = "DOCUMENT_POSITION"; - public static final String DOCUMENT_STRUCTURE = "DOCUMENT_STRUCTURE"; - public static final String DOCUMENT_PAGES = "DOCUMENT_PAGES"; - public static final String DOCUMENT_CHUNKS = "DOCUMENT_CHUNKS"; + public static final String DOCUMENT_TEXT = "DOCUMENT_TEXT.proto"; + public static final String DOCUMENT_POSITIONS = "DOCUMENT_POSITION.proto"; + public static final String DOCUMENT_STRUCTURE = "DOCUMENT_STRUCTURE.proto"; + public static final String DOCUMENT_PAGES = "DOCUMENT_PAGES.proto"; + public static final String DOCUMENT_CHUNKS = "DOCUMENT_CHUNKS.json"; + public static final String STORAGE_ID = "08904e84-4a5a-4c15-bc13-200237af6434/4d81e891fd3e94dfe0b6c51073ef55b6."; @Autowired LlmNerService llmNerService; @@ -34,10 +37,10 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest { @SneakyThrows public void testLlmNer() { - Path folder = Path.of("/home/kschuettler/Downloads/New Folder (5)/18299ec0-7659-496a-a44a-194bbffb1700/1fb7d49ae389469f60db516cf81a3510"); + Path folder = Path.of("/Users/maverickstuder/Downloads/10-09-2024-16-03-47_files_list"); LlmNerMessage message = prepStorage(folder); llmNerService.runNer(message); - Path tmpFile = Path.of("tmp", "AAA_LLM_ENTITIES", "entities.json"); + Path tmpFile = Path.of("/private/tmp", "LLM_ENTITIES", "entities.json"); Files.createDirectories(tmpFile.getParent()); storageService.downloadTo(TEST_TENANT, message.getResultStorageId(), tmpFile.toFile()); } @@ -60,7 +63,7 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest { try (var in = new FileInputStream(relevantFile.toFile())) { storageService.storeObject(TenantContext.getTenantId(), - folder + relevantFiles.stream() + STORAGE_ID + relevantFiles.stream() .filter(filePath -> relevantFile.getFileName().toString().contains(filePath)) .findFirst() .orElseThrow(), @@ -71,14 +74,34 @@ public class LlmNerServiceTest extends AbstractLlmServiceIntegrationTest { private static LlmNerMessage buildMessage(Path folder) { + List entityAiDescriptions = new ArrayList<>(); + + // Add descriptions for each entity type with examples + entityAiDescriptions.add(new EntityAiDescription("PERSON", + "A PERSON is any name referring to a human, excluding named methods (e.g., 'Klingbeil Test' is not a name). Each name should be its own entity, but first name, last name, and possibly middle name should be merged. Numbers are never part of a name. " + + "For example: 'Jennifer Durando, BS', 'Charlène Hernandez', 'Shaw A.', 'G J J Lubbe'.")); + entityAiDescriptions.add(new EntityAiDescription("PII", + "PII refers to personally identifiable information such as email addresses, telephone numbers, fax numbers, or any other information that could uniquely identify an individual. " + + "For example: '01223 45678', 'mimi.lang@smithcorp.com', '+44 (0)1252 392460'.")); + entityAiDescriptions.add(new EntityAiDescription("ADDRESS", + "An ADDRESS describes a real-life location. It should be as complete as possible and may include elements such as street address, city, state, postal code, and country. " + + "For example: 'Product Safety Labs 2394 US Highway 130 Dayton, NJ 08810 USA', 'Syngenta Crop Protection, LLC 410 Swing Road Post Office Box 18300 Greensboro, NC 27419-8300 USA'.")); + entityAiDescriptions.add(new EntityAiDescription("COMPANY", + "A COMPANY is any corporate entity or approving body mentioned in the text, excluding companies mentioned as part of an address. " + + "For example: 'Syngenta', 'EFSA'.")); + entityAiDescriptions.add(new EntityAiDescription("COUNTRY", + "A COUNTRY is any recognized nation mentioned in the text. Countries mentioned as part of an address should not be listed separately. " + + "For example: 'USA'.")); + return LlmNerMessage.builder() .identifier(Map.of("file", folder.getFileName().toString())) - .chunksStorageId(folder + DOCUMENT_CHUNKS) - .documentPagesStorageId(folder + DOCUMENT_PAGES) - .documentTextStorageId(folder + DOCUMENT_TEXT) - .documentPositionStorageId(folder + DOCUMENT_POSITIONS) - .documentStructureStorageId(folder + DOCUMENT_STRUCTURE) - .resultStorageId(folder + "result") + .entityAiDescriptions(entityAiDescriptions) + .chunksStorageId(STORAGE_ID + DOCUMENT_CHUNKS) + .documentPagesStorageId(STORAGE_ID + DOCUMENT_PAGES) + .documentTextStorageId(STORAGE_ID + DOCUMENT_TEXT) + .documentPositionStorageId(STORAGE_ID + DOCUMENT_POSITIONS) + .documentStructureStorageId(STORAGE_ID + DOCUMENT_STRUCTURE) + .resultStorageId(STORAGE_ID + "result") .build(); }