From 795f8fd31bbc3732b3e96d526e34327dd9b4ac79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Mon, 2 Sep 2024 16:24:11 +0200 Subject: [PATCH] entitylog-mapping: add test to map entitylogs to existing DocumentData --- .../server/model/document/nodes/Document.java | 4 +- .../service/EntityLogCreatorService.java | 2 +- .../document/EntityFindingUtility.java | 17 +- .../MapEntityLogToDocumentDataTest.java | 425 ++++++++++++++++++ 4 files changed, 436 insertions(+), 12 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/MapEntityLogToDocumentDataTest.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java index 2c1e75be..37347aef 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/model/document/nodes/Document.java @@ -63,8 +63,8 @@ public class Document extends AbstractSemanticNode { * * @return A list of main sections within the document * @deprecated This method is marked for removal. - * Use {@link #streamChildrenOfType(NodeType)} instead, - * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. + * Use {@link #streamChildrenOfType(NodeType)} instead, + * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. */ @Deprecated(forRemoval = true) public List
getMainSections() { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/EntityLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/EntityLogCreatorService.java index 182b93f5..a5145b8f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/EntityLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/EntityLogCreatorService.java @@ -148,7 +148,7 @@ public class EntityLogCreatorService { } - private List createEntityLogEntries(Document document, AnalyzeRequest analyzeRequest, List notFoundPrecursorEntries, int analysisNumber) { + public List createEntityLogEntries(Document document, AnalyzeRequest analyzeRequest, List notFoundPrecursorEntries, int analysisNumber) { String dossierTemplateId = analyzeRequest.getDossierTemplateId(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityFindingUtility.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityFindingUtility.java index 70797490..15df7085 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityFindingUtility.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/service/document/EntityFindingUtility.java @@ -17,6 +17,7 @@ import java.util.stream.Collectors; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import com.google.common.collect.Sets; import com.iqser.red.service.redaction.v1.server.model.ClosestEntity; import 
com.iqser.red.service.redaction.v1.server.model.PrecursorEntity; import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage; @@ -26,7 +27,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionO import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page; import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode; -import com.iqser.red.service.redaction.v1.server.service.DictionaryService; import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations; import lombok.extern.slf4j.Slf4j; @@ -36,14 +36,12 @@ import lombok.extern.slf4j.Slf4j; public class EntityFindingUtility { EntityCreationService entityCreationService; - DictionaryService dictionaryService; @Autowired - public EntityFindingUtility(EntityEnrichmentService entityEnrichmentService, DictionaryService dictionaryService) { + public EntityFindingUtility(EntityEnrichmentService entityEnrichmentService) { entityCreationService = new EntityCreationService(entityEnrichmentService); - this.dictionaryService = dictionaryService; } @@ -172,12 +170,13 @@ public class EntityFindingUtility { if (!pageNumbers.stream() .allMatch(node::onPage)) { - throw new IllegalArgumentException(format("SemanticNode \"%s\" does not contain these pages %s, it has pages: %s", + throw new IllegalArgumentException(format("SemanticNode \"%s\" is missing pages %s", node, - pageNumbers.stream() - .filter(pageNumber -> !node.onPage(pageNumber)) - .toList(), - node.getPages())); + Sets.difference(pageNumbers, + node.getPages() + .stream() + .map(Page::getNumber) + .collect(Collectors.toSet())))); } SearchImplementation searchImplementation = new SearchImplementation(entryValues.stream() diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/MapEntityLogToDocumentDataTest.java 
b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/MapEntityLogToDocumentDataTest.java
new file mode 100644
index 00000000..25129e83
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/MapEntityLogToDocumentDataTest.java
@@ -0,0 +1,500 @@
+package com.iqser.red.service.redaction.v1.server;
+
+import static com.iqser.red.service.redaction.v1.server.service.document.EntityFromPrecursorCreationService.createCorrectEntity;
+import static org.mockito.Mockito.when;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
+import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.context.annotation.FilterType;
+import org.springframework.context.annotation.Import;
+import org.springframework.context.annotation.Primary;
+import org.springframework.test.context.junit.jupiter.SpringExtension;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Sets;
+import com.iqser.red.commons.jackson.ObjectMapperFactory;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.FileStatus;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLog;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryState;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryType;
+import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
+import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
+import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
+import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
+import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
+import com.iqser.red.service.redaction.v1.server.service.EntityLogCreatorService;
+import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
+import com.iqser.red.service.redaction.v1.server.service.document.EntityFindingUtility;
+import com.iqser.red.storage.commons.StorageAutoConfiguration;
+import com.iqser.red.storage.commons.service.StorageService;
+import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
+import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration;
+import com.knecon.fforesight.tenantcommons.TenantContext;
+
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Manually run migration helper (therefore {@link Disabled}) that maps an existing entity log onto stored document data.
+ * <p>
+ * It reads an entity log and the serialized document structure, text, position and page files from the local file
+ * system, searches the text entries in the rebuilt document graph, recreates the log entries through the
+ * {@link EntityLogCreatorService} and writes the resulting entity log back to disk.
+ */
+@Disabled
+@Slf4j
+@ExtendWith(SpringExtension.class)
+@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
+@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
+public class MapEntityLogToDocumentDataTest extends AbstractRedactionIntegrationTest {
+
+    public static final int MATCH_THRESHOLD = 25;
+    ObjectMapper mapper = ObjectMapperFactory.create();
+    Path outputFolder = Path.of("/tmp/MIGRATED_ENTITY_LOGS");
+
+    @Autowired
+    RedactionServiceSettings redactionServiceSettings;
+    @Autowired
+    EntityFindingUtility entityFindingUtility;
+    @Autowired
+    EntityLogCreatorService entityLogCreatorService;
+    @Autowired
+    private DictionaryService dictionaryService;
+
+    private AtomicLong totalEntities;
+    private AtomicLong totalNotFoundEntities;
+    private AtomicLong totalRemovedDuplicates;
+
+    @Configuration
+    @EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
+    @Import({LayoutParsingServiceProcessorConfiguration.class})
+    @ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
+    public static class RedactionIntegrationTestConfiguration {
+
+        @Bean
+        @Primary
+        public StorageService inmemoryStorage() {
+
+            return new FileSystemBackedStorageService(ObjectMapperFactory.create());
+        }
+
+    }
+
+
+    /**
+     * Stubs the rules and dictionary clients, loads the test dictionaries and resets the run-wide counters before each test.
+     */
+    @BeforeEach
+    public void stubClients() {
+
+        TenantContext.setTenantId("redaction");
+
+        when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
+        when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.COMPONENT)).thenReturn(-1L);
+
+        loadDictionaryForTest();
+        loadTypeForTest();
+        loadNerForTest();
+        when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(0L);
+        when(dictionaryClient.getAllTypesForDossierTemplate(TEST_DOSSIER_TEMPLATE_ID, null, true)).thenReturn(getTemplateDictionaryTypeResponse());
+
+        when(dictionaryClient.getVersionForDossier(TEST_DOSSIER_ID)).thenReturn(0L);
+        when(dictionaryClient.getAllTypesForDossier(TEST_DOSSIER_ID, null, true)).thenReturn(getDossierDictionaryTypeResponse());
+
+        mockDictionaryCalls(null);
+
+        when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors);
+        dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
+        totalNotFoundEntities = new AtomicLong();
+        totalEntities = new AtomicLong();
+        totalRemovedDuplicates = new AtomicLong();
+    }
+
+
+    /**
+     * Migrates a single entity log; all paths are hard-coded absolute paths that have to be adapted to the local machine.
+     */
+    @Test
+    public void migrateEntityLog() {
+
+        Path entityLogPath = Path.of("/home/kschuettler/Dokumente/TestFiles/NER Dataset/1c4f5fa9a36d67d285f051f3a22813c4.ENTITY_LOG.json");
+        Path documentStructurePath = Path.of(
+                "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_STRUCTURE.json");
+        Path documentTextPath = Path.of(
+                "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_TEXT.json");
+        Path documentPositionsPath = Path.of(
+                "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_POSITION.json");
+        Path documentPagesPath = Path.of(
+                "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_PAGES.json");
+        Path outputPath = Path.of("/tmp/migratedEntityLog.json");
+
+        runForFile(entityLogPath, documentStructurePath, documentTextPath, documentPositionsPath, documentPagesPath, outputPath);
+    }
+
+
+    /**
+     * Migrates the entity log of every file that has a {@code FILE_STATUS.json} below the file exchange folder.
+     * <p>
+     * The original entity log is looked up by the part of the file name before the first dot, the document data files by
+     * the file id in the folder of the status file. Results are written to {@link #outputFolder}.
+     */
+    @Test
+    @SneakyThrows
+    public void migrateAllEntityLogs() {
+
+        Path entityLogFolder = Path.of("/home/kschuettler/Dokumente/TestFiles/NER Dataset/");
+        Path fileExchangeFolder = Path.of("/home/kschuettler/Downloads/New Folder (8)/");
+        // Files.walk keeps directory handles open until the returned stream is closed.
+        try (var fileExchangeFiles = Files.walk(fileExchangeFolder)) {
+            fileExchangeFiles.filter(file -> file.getFileName().toString().endsWith("FILE_STATUS.json"))
+                    .forEach(path -> {
+                        FileStatus fileStatus = readFileStatus(path);
+                        String originalFileName = fileStatus.getFilename().split("\\.")[0];
+                        // The walk starts at the absolute fileExchangeFolder, so the parent already is the complete folder path.
+                        Path fileFolder = path.getParent();
+                        runForFile(entityLogFolder.resolve(originalFileName + ".ENTITY_LOG.json"),
+                                   fileFolder.resolve(fileStatus.getId() + ".DOCUMENT_STRUCTURE.json"),
+                                   fileFolder.resolve(fileStatus.getId() + ".DOCUMENT_TEXT.json"),
+                                   fileFolder.resolve(fileStatus.getId() + ".DOCUMENT_POSITION.json"),
+                                   fileFolder.resolve(fileStatus.getId() + ".DOCUMENT_PAGES.json"),
+                                   outputFolder.resolve(fileStatus.getId() + ".ENTITY_LOG.json"));
+                    });
+        }
+
+        log.info("All EntityLogs migrated and written to {}, did not find {} of {}, removed {} duplicate",
+                 outputFolder,
+                 totalNotFoundEntities.get(),
+                 totalEntities.get(),
+                 totalRemovedDuplicates.get());
+    }
+
+
+    /**
+     * Deserializes the file status stored at the given path.
+     */
+    @SneakyThrows
+    private FileStatus readFileStatus(Path path) {
+
+        try (var in = new FileInputStream(path.toFile())) {
+            return mapper.readValue(in, FileStatus.class);
+        }
+    }
+
+
+    /**
+     * Maps one entity log onto one document and writes the migrated entity log to {@code outputPath}.
+     * <p>
+     * Text entries are searched in the document graph; entries without a match are handed to the
+     * {@link EntityLogCreatorService} as not-found precursor entities. Image and area entries are carried over unchanged
+     * if the service did not recreate them. Manual changes, changes and references are copied from the original entries.
+     *
+     * @throws AssertionError if a non-removed id of the original entity log is missing after the migration
+     */
+    private void runForFile(Path entityLogPath, Path documentStructurePath, Path documentTextPath, Path documentPositionsPath, Path documentPagesPath, Path outputPath) {
+
+        log.info("Starting entity log migration for file {} and entityLog {}", documentStructurePath, entityLogPath);
+
+        EntityLog entityLog = readEntityLog(entityLogPath);
+
+        Map<String, Long> originalCounts = entityLog.getEntityLogEntry()
+                .stream()
+                .filter(e -> !e.getState().equals(EntryState.REMOVED))
+                .collect(Collectors.groupingBy(EntityLogEntry::getType, Collectors.counting()));
+
+        Document document = readDocument(documentStructurePath, documentTextPath, documentPositionsPath, documentPagesPath);
+        List<PrecursorEntity> textPrecursorEntities = entityLog.getEntityLogEntry()
+                .stream()
+                .filter(entry -> !isImageOrArea(entry))
+                .map(this::precursorEntityWithState)
+                .toList();
+
+        List<EntityLogEntry> imageOrAreaEntries = entityLog.getEntityLogEntry()
+                .stream()
+                .filter(MapEntityLogToDocumentDataTest::isImageOrArea)
+                .toList();
+
+        List<PrecursorEntity> notFoundEntities = new LinkedList<>();
+        Map<String, List<TextEntity>> tempEntitiesByValue = entityFindingUtility.findAllPossibleEntitiesAndGroupByValue(document, textPrecursorEntities);
+        for (PrecursorEntity precursorEntity : textPrecursorEntities) {
+            Optional<TextEntity> optionalClosestEntity = entityFindingUtility.findClosestEntityAndReturnEmptyIfNotFound(precursorEntity, tempEntitiesByValue, MATCH_THRESHOLD);
+            if (optionalClosestEntity.isEmpty()) {
+                notFoundEntities.add(precursorEntity);
+                continue;
+            }
+            createCorrectEntity(precursorEntity, optionalClosestEntity.get(), false);
+        }
+
+        tempEntitiesByValue.values()
+                .stream()
+                .flatMap(Collection::stream)
+                .forEach(TextEntity::removeFromGraph);
+
+        List<EntityLogEntry> entityLogEntries = entityLogCreatorService.createEntityLogEntries(document,
+                                                                                               buildTestAnalyzeRequest(),
+                                                                                               notFoundEntities,
+                                                                                               entityLog.getAnalysisNumber());
+
+        Map<String, EntityLogEntry> migratedEntityLogEntriesLookup = entityLogEntries.stream()
+                .collect(Collectors.toMap(EntityLogEntry::getId, Function.identity()));
+
+        addAreaOrImageEntitiesIfNotPresent(migratedEntityLogEntriesLookup, imageOrAreaEntries);
+
+        requireAllPreviousIdsPresent(entityLog, migratedEntityLogEntriesLookup);
+
+        for (EntityLogEntry entityLogEntry : entityLog.getEntityLogEntry()) {
+            if (entityLogEntry.getState().equals(EntryState.REMOVED)) {
+                continue;
+            }
+            // Present for every non-removed entry, otherwise requireAllPreviousIdsPresent would have thrown.
+            EntityLogEntry migratedEntry = migratedEntityLogEntriesLookup.get(entityLogEntry.getId());
+            migratedEntry.setManualChanges(entityLogEntry.getManualChanges());
+            migratedEntry.setChanges(entityLogEntry.getChanges());
+            migratedEntry.setReference(entityLogEntry.getReference());
+            if (entityLogEntry.getEntryType().equals(EntryType.IMAGE) || entityLogEntry.getEntryType().equals(EntryType.IMAGE_HINT)) {
+                migratedEntry.setState(entityLogEntry.getState());
+                migratedEntry.setReason(entityLogEntry.getReason());
+                migratedEntry.setLegalBasis(entityLogEntry.getLegalBasis());
+            }
+        }
+
+        logMigratedEntityLogDifference(entityLog, migratedEntityLogEntriesLookup);
+
+        entityLog.setEntityLogEntry(new LinkedList<>(migratedEntityLogEntriesLookup.values()));
+
+        Map<String, Long> migratedCounts = entityLog.getEntityLogEntry()
+                .stream()
+                .collect(Collectors.groupingBy(EntityLogEntry::getType, Collectors.counting()));
+
+        for (String type : originalCounts.keySet()) {
+            if (!migratedCounts.containsKey(type)) {
+                log.error("Type {} missing entirely", type);
+                continue;
+            }
+            long originalCount = originalCounts.get(type);
+            long migratedCount = migratedCounts.get(type);
+            if (originalCount != migratedCount) {
+                log.error("Type {} mismatch {} <-> {}", type, originalCount, migratedCount);
+            }
+
+        }
+        saveMigratedEntityLog(outputPath, entityLog);
+        log.info("Finished entityLog migration, did not find {} of {}", notFoundEntities.size(), migratedEntityLogEntriesLookup.size());
+        totalEntities.getAndAdd(migratedEntityLogEntriesLookup.size());
+        totalNotFoundEntities.getAndAdd(notFoundEntities.size());
+    }
+
+
+    /**
+     * Logs an error if the entry counts differ and a migrated entry without a non-removed original is neither an image nor an area.
+     */
+    private static void logMigratedEntityLogDifference(EntityLog entityLog, Map<String, EntityLogEntry> migratedEntityLogEntriesLookup) {
+
+        if (entityLog.getEntityLogEntry().size() != migratedEntityLogEntriesLookup.values().size()) {
+            if (Sets.difference(migratedEntityLogEntriesLookup.keySet(), buildIdsToMigrate(entityLog))
+                    .stream()
+                    .map(migratedEntityLogEntriesLookup::get)
+                    .anyMatch(entry -> !isImageOrArea(entry))) {
+
+                log.error("Entity count mismatch {} <-> {}", entityLog.getEntityLogEntry().size(), migratedEntityLogEntriesLookup.values().size());
+            }
+        }
+    }
+
+
+    /**
+     * Collects the ids of all entries of the entity log that are not in state REMOVED.
+     */
+    private static Set<String> buildIdsToMigrate(EntityLog entityLog) {
+
+        return entityLog.getEntityLogEntry()
+                .stream()
+                .filter(e -> e.getState() != EntryState.REMOVED)
+                .map(EntityLogEntry::getId)
+                .collect(Collectors.toSet());
+    }
+
+
+    /**
+     * Adds each original image or area entry whose id is not yet present in {@code entityLogEntries}.
+     */
+    private static void addAreaOrImageEntitiesIfNotPresent(Map<String, EntityLogEntry> entityLogEntries, List<EntityLogEntry> imageOrAreaEntries) {
+
+        for (EntityLogEntry imageOrAreaEntry : imageOrAreaEntries) {
+
+            if (entityLogEntries.containsKey(imageOrAreaEntry.getId())) {
+                continue;
+            }
+            entityLogEntries.put(imageOrAreaEntry.getId(), imageOrAreaEntry);
+        }
+    }
+
+
+    /**
+     * Returns whether the entry is of type IMAGE, IMAGE_HINT or AREA.
+     */
+    private static boolean isImageOrArea(EntityLogEntry entry) {
+
+        return entry.getEntryType().equals(EntryType.IMAGE) || entry.getEntryType().equals(EntryType.IMAGE_HINT) || entry.getEntryType().equals(EntryType.AREA);
+    }
+
+
+    /**
+     * Verifies that every non-removed id of the original entity log is present in the migrated entries.
+     *
+     * @throws AssertionError if at least one id is missing
+     */
+    private static void requireAllPreviousIdsPresent(EntityLog entityLog, Map<String, EntityLogEntry> migratedEntityLogEntriesLookup) {
+
+        Set<String> existingEntryIds = buildIdsToMigrate(entityLog);
+
+        Set<String> idDiff = Sets.difference(existingEntryIds, migratedEntityLogEntriesLookup.keySet());
+        if (!idDiff.isEmpty()) {
+            log.error("Missing ids: {} are missing ", idDiff);
+            throw new AssertionError(String.format("Missing ids: %s are missing ", idDiff));
+        }
+    }
+
+
+    /**
+     * Builds an analyze request for the test dossier template, file and dossier ids.
+     */
+    private static AnalyzeRequest buildTestAnalyzeRequest() {
+
+        return AnalyzeRequest.builder().dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID).fileId(TEST_FILE_ID).dossierId(TEST_DOSSIER_ID).build();
+    }
+
+
+    /**
+     * Writes the entity log as JSON to {@code outputPath} after creating its parent directories.
+     */
+    @SneakyThrows
+    private void saveMigratedEntityLog(Path outputPath, EntityLog entityLog) {
+
+        Files.createDirectories(outputPath.getParent());
+        try (var out = new FileOutputStream(outputPath.toFile())) {
+            mapper.writeValue(out, entityLog);
+        }
+    }
+
+
+    /**
+     * Creates a precursor entity from the log entry and transfers the entry's state to it; PENDING is handled like REMOVED.
+     */
+    private PrecursorEntity precursorEntityWithState(EntityLogEntry entityLogEntry) {
+
+        PrecursorEntity precursorEntity = PrecursorEntity.fromEntityLogEntry(entityLogEntry);
+        switch (entityLogEntry.getState()) {
+            case APPLIED -> precursorEntity.apply(entityLogEntry.getMatchedRule(), entityLogEntry.getReason(), entityLogEntry.getLegalBasis());
+            case SKIPPED -> precursorEntity.skip(entityLogEntry.getMatchedRule(), entityLogEntry.getReason());
+            case IGNORED -> precursorEntity.ignore(entityLogEntry.getMatchedRule(), entityLogEntry.getReason());
+            case REMOVED, PENDING -> precursorEntity.remove(entityLogEntry.getMatchedRule(), entityLogEntry.getReason());
+        }
+        return precursorEntity;
+    }
+
+
+    /**
+     * Deserializes the structure, text, position and page files and builds the document graph from them.
+     */
+    @SneakyThrows
+    private Document readDocument(Path documentStructurePath, Path documentTextPath, Path documentPositionsPath, Path documentPagesPath) {
+
+        var documentDataBuilder = DocumentData.builder();
+        try (var in = new FileInputStream(documentStructurePath.toFile())) {
+            documentDataBuilder.documentStructure(mapper.readValue(in, DocumentStructure.class));
+        }
+        try (var in = new FileInputStream(documentTextPath.toFile())) {
+            documentDataBuilder.documentTextData(mapper.readValue(in, DocumentTextData[].class));
+        }
+        try (var in = new FileInputStream(documentPositionsPath.toFile())) {
+            documentDataBuilder.documentPositionData(mapper.readValue(in, DocumentPositionData[].class));
+        }
+        try (var in = new FileInputStream(documentPagesPath.toFile())) {
+            documentDataBuilder.documentPages(mapper.readValue(in, DocumentPage[].class));
+        }
+
+        return DocumentGraphMapper.toDocumentGraph(documentDataBuilder.build());
+    }
+
+
+    /**
+     * Reads the entity log from the given JSON file and removes entries with duplicate ids.
+     */
+    @SneakyThrows
+    private EntityLog readEntityLog(Path entityLogPath) {
+
+        try (var in = new FileInputStream(entityLogPath.toFile())) {
+            return removeDuplicates(mapper.readValue(in, EntityLog.class));
+        }
+    }
+
+
+    /**
+     * Removes entries with an already seen id from the entity log and adds the number of duplicates to the run total.
+     * <p>
+     * The first entry per id wins, unless it is in state REMOVED and a later duplicate is not; then the later one replaces it.
+     *
+     * @param entityLog the entity log to clean; its entry list is replaced in place
+     * @return the given entity log, containing at most one entry per id
+     */
+    protected EntityLog removeDuplicates(EntityLog entityLog) {
+
+        int duplicateCount = 0;
+
+        Map<String, EntityLogEntry> keptEntriesById = new HashMap<>();
+
+        List<EntityLogEntry> cleanedEntries = new ArrayList<>(entityLog.getEntityLogEntry().size());
+        for (EntityLogEntry entry : entityLog.getEntityLogEntry()) {
+            EntityLogEntry keptEntry = keptEntriesById.putIfAbsent(entry.getId(), entry);
+            if (keptEntry == null) {
+                cleanedEntries.add(entry);
+                continue;
+            }
+            duplicateCount++;
+            log.warn("Duplicate entry found for id {} \nExisting: {}\n Duplicate: {}", entry.getId(), keptEntry, entry);
+            if (keptEntry.getState().equals(EntryState.REMOVED) && !entry.getState().equals(EntryState.REMOVED)) {
+                cleanedEntries.remove(keptEntry);
+                cleanedEntries.add(entry);
+                // Remember the replacement; otherwise further duplicates are compared against the dropped entry and added again.
+                keptEntriesById.put(entry.getId(), entry);
+            }
+        }
+        entityLog.setEntityLogEntry(cleanedEntries);
+        log.info("Removed {} duplicates", duplicateCount);
+        totalRemovedDuplicates.getAndAdd(duplicateCount);
+        return entityLog;
+    }
+
+}