Merge branch 'entitylog-migration' into 'master'

entitylog-mapping: add test to map entitylogs to existing DocumentData

See merge request redactmanager/redaction-service!503
This commit is contained in:
Kilian Schüttler 2024-09-02 16:24:11 +02:00
commit f3cdf46008
4 changed files with 436 additions and 12 deletions

View File

@ -63,8 +63,8 @@ public class Document extends AbstractSemanticNode {
*
* @return A list of main sections within the document
* @deprecated This method is marked for removal.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
* Use {@link #streamChildrenOfType(NodeType)} instead,
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
*/
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {

View File

@ -148,7 +148,7 @@ public class EntityLogCreatorService {
}
private List<EntityLogEntry> createEntityLogEntries(Document document, AnalyzeRequest analyzeRequest, List<PrecursorEntity> notFoundPrecursorEntries, int analysisNumber) {
public List<EntityLogEntry> createEntityLogEntries(Document document, AnalyzeRequest analyzeRequest, List<PrecursorEntity> notFoundPrecursorEntries, int analysisNumber) {
String dossierTemplateId = analyzeRequest.getDossierTemplateId();

View File

@ -17,6 +17,7 @@ import java.util.stream.Collectors;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.google.common.collect.Sets;
import com.iqser.red.service.redaction.v1.server.model.ClosestEntity;
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
@ -26,7 +27,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionO
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
import lombok.extern.slf4j.Slf4j;
@ -36,14 +36,12 @@ import lombok.extern.slf4j.Slf4j;
public class EntityFindingUtility {
EntityCreationService entityCreationService;
DictionaryService dictionaryService;
@Autowired
public EntityFindingUtility(EntityEnrichmentService entityEnrichmentService, DictionaryService dictionaryService) {
public EntityFindingUtility(EntityEnrichmentService entityEnrichmentService) {
entityCreationService = new EntityCreationService(entityEnrichmentService);
this.dictionaryService = dictionaryService;
}
@ -172,12 +170,13 @@ public class EntityFindingUtility {
if (!pageNumbers.stream()
.allMatch(node::onPage)) {
throw new IllegalArgumentException(format("SemanticNode \"%s\" does not contain these pages %s, it has pages: %s",
throw new IllegalArgumentException(format("SemanticNode \"%s\" is missing pages %s",
node,
pageNumbers.stream()
.filter(pageNumber -> !node.onPage(pageNumber))
.toList(),
node.getPages()));
Sets.difference(pageNumbers,
node.getPages()
.stream()
.map(Page::getNumber)
.collect(Collectors.toSet()))));
}
SearchImplementation searchImplementation = new SearchImplementation(entryValues.stream()

View File

@ -0,0 +1,425 @@
package com.iqser.red.service.redaction.v1.server;
import static com.iqser.red.service.redaction.v1.server.service.document.EntityFromPrecursorCreationService.createCorrectEntity;
import static org.mockito.Mockito.when;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.FilterType;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.FileStatus;
import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLog;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryState;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryType;
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.service.EntityLogCreatorService;
import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
import com.iqser.red.service.redaction.v1.server.service.document.EntityFindingUtility;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Disabled
@Slf4j
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
public class MapEntityLogToDocumentDataTest extends AbstractRedactionIntegrationTest {
// Threshold handed to EntityFindingUtility#findClosestEntityAndReturnEmptyIfNotFound when matching
// precursor entities against candidates; NOTE(review): unit/semantics are defined by that utility - confirm.
public static final int MATCH_THRESHOLD = 25;
// JSON mapper used for every read and write of entity logs, file status and document data in this test.
ObjectMapper mapper = ObjectMapperFactory.create();
// Target folder for the entity logs written by migrateAllEntityLogs().
Path outputFolder = Path.of("/tmp/MIGRATED_ENTITY_LOGS");
// Injected but not referenced by any code visible in this class.
@Autowired
RedactionServiceSettings redactionServiceSettings;
// Finds candidate TextEntities in the document and the closest match for each precursor entity.
@Autowired
EntityFindingUtility entityFindingUtility;
// Builds the migrated EntityLogEntry list from the document and the precursor entities that were not found.
@Autowired
EntityLogCreatorService entityLogCreatorService;
// Used in stubClients() to update the dictionary for the test dossier template and dossier.
@Autowired
private DictionaryService dictionaryService;
// Counters accumulated across all runForFile() calls; re-created in stubClients() before each test.
private AtomicLong totalEntities;
private AtomicLong totalNotFoundEntities;
private AtomicLong totalRemovedDuplicates;
/**
 * Spring configuration declared alongside this test: auto-configuration is enabled without
 * {@link RabbitAutoConfiguration}, the layout parser configuration is imported, and
 * {@link StorageAutoConfiguration} is excluded from component scanning so that the
 * {@link StorageService} bean defined below can be used instead.
 */
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
@Import({LayoutParsingServiceProcessorConfiguration.class})
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
public static class RedactionIntegrationTestConfiguration {

    /**
     * Provides the primary {@link StorageService} bean, implemented by a
     * {@link FileSystemBackedStorageService} with its own freshly created object mapper.
     *
     * @return the storage service to inject wherever a {@link StorageService} is required
     */
    @Bean
    @Primary
    public StorageService inmemoryStorage() {

        var storageMapper = ObjectMapperFactory.create();
        return new FileSystemBackedStorageService(storageMapper);
    }

}
/**
 * Prepares every test: sets the tenant, stubs the rules and dictionary clients, updates the
 * dictionary for the test dossier template/dossier and resets the migration counters.
 * NOTE(review): the load*ForTest(), mockDictionaryCalls() and get*TypeResponse() helpers, as well as
 * rulesClient, dictionaryClient and colors, are not declared in this class - presumably inherited
 * from AbstractRedactionIntegrationTest; confirm there.
 */
@BeforeEach
public void stubClients() {
TenantContext.setTenantId("redaction");
// Entity rules report the current time as their version, component rules report -1.
when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.COMPONENT)).thenReturn(-1L);
loadDictionaryForTest();
loadTypeForTest();
loadNerForTest();
// Dictionary versions for template and dossier are both stubbed to 0.
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(0L);
when(dictionaryClient.getAllTypesForDossierTemplate(TEST_DOSSIER_TEMPLATE_ID, null, true)).thenReturn(getTemplateDictionaryTypeResponse());
when(dictionaryClient.getVersionForDossier(TEST_DOSSIER_ID)).thenReturn(0L);
when(dictionaryClient.getAllTypesForDossier(TEST_DOSSIER_ID, null, true)).thenReturn(getDossierDictionaryTypeResponse());
mockDictionaryCalls(null);
when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors);
// Runs after all stubs above are in place.
dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
// Fresh counters for each test so totals do not carry over between test methods.
totalNotFoundEntities = new AtomicLong();
totalEntities = new AtomicLong();
totalRemovedDuplicates = new AtomicLong();
}
/**
 * Migrates a single, hard-coded entity log against a single, hard-coded set of document data
 * files and writes the result to {@code /tmp/migratedEntityLog.json}.
 */
@Test
public void migrateEntityLog() {

    var entityLogFile = Path.of("/home/kschuettler/Dokumente/TestFiles/NER Dataset/1c4f5fa9a36d67d285f051f3a22813c4.ENTITY_LOG.json");
    var structureFile = Path.of(
            "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_STRUCTURE.json");
    var textFile = Path.of(
            "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_TEXT.json");
    var positionsFile = Path.of(
            "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_POSITION.json");
    var pagesFile = Path.of(
            "/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_PAGES.json");
    var migratedLogFile = Path.of("/tmp/migratedEntityLog.json");

    runForFile(entityLogFile, structureFile, textFile, positionsFile, pagesFile, migratedLogFile);
}
/**
 * Migrates every entity log that has a matching {@code FILE_STATUS.json} below the file exchange
 * folder and logs the accumulated totals afterwards.
 * <p>
 * For each file status found, the entity log is looked up in the entity log folder by the part of
 * the original file name before its first dot, and the document data files are expected next to
 * the file status, named after the file id.
 */
@Test
@SneakyThrows
public void migrateAllEntityLogs() {

    Path entityLogFolder = Path.of("/home/kschuettler/Dokumente/TestFiles/NER Dataset/");
    Path fileExchangeFolder = Path.of("/home/kschuettler/Downloads/New Folder (8)/");

    // Files.walk keeps directory handles open until the stream is closed, so it must be closed explicitly.
    try (var files = Files.walk(fileExchangeFolder)) {
        files.filter(file -> file.getFileName().toString().endsWith("FILE_STATUS.json"))
                .forEach(path -> {
                    FileStatus fileStatus = readFileStatus(path);
                    // Everything from the first dot onwards is dropped from the original file name.
                    String originalFileName = fileStatus.getFilename().split("\\.")[0];
                    Path fileFolder = path.getParent();
                    runForFile(entityLogFolder.resolve(originalFileName + ".ENTITY_LOG.json"),
                               fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_STRUCTURE.json"),
                               fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_TEXT.json"),
                               fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_POSITION.json"),
                               fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_PAGES.json"),
                               outputFolder.resolve(fileStatus.getId() + ".ENTITY_LOG.json"));
                });
    }

    log.info("All EntityLogs migrated and written to {}, did not find {} of {}, removed {} duplicate",
             outputFolder,
             totalNotFoundEntities.get(),
             totalEntities.get(),
             totalRemovedDuplicates.get());
}
/**
 * Deserializes a {@link FileStatus} from the JSON file at the given path.
 *
 * @param path location of the file status JSON
 * @return the parsed file status
 */
@SneakyThrows
private FileStatus readFileStatus(Path path) {

    try (FileInputStream statusStream = new FileInputStream(path.toFile())) {
        FileStatus parsedStatus = mapper.readValue(statusStream, FileStatus.class);
        return parsedStatus;
    }
}
/**
 * Migrates one entity log onto the document graph built from the given document data files and
 * writes the migrated log to {@code outputPath}.
 * <p>
 * Steps: read and de-duplicate the entity log, rebuild the document, match every non-image/area
 * entry to its closest entity in the document, create new entity log entries from the document,
 * re-add image/area entries that are not present, verify that every non-removed id survived,
 * carry over manual changes, changes and references (plus state, reason and legal basis for
 * images), compare per-type counts, save the result and update the running totals.
 *
 * @param entityLogPath         entity log JSON to migrate
 * @param documentStructurePath document structure JSON
 * @param documentTextPath      document text JSON
 * @param documentPositionsPath document position JSON
 * @param documentPagesPath     document pages JSON
 * @param outputPath            file the migrated entity log is written to
 */
private void runForFile(Path entityLogPath, Path documentStructurePath, Path documentTextPath, Path documentPositionsPath, Path documentPagesPath, Path outputPath) {

    log.info("Starting entity log migration for file {} and entityLog {}", documentStructurePath, entityLogPath);

    EntityLog entityLog = readEntityLog(entityLogPath);
    List<EntityLogEntry> sourceEntries = entityLog.getEntityLogEntry();

    // Per-type counts of everything that is expected to survive the migration.
    Map<String, Long> originalCounts = sourceEntries.stream()
            .filter(entry -> !entry.getState().equals(EntryState.REMOVED))
            .collect(Collectors.groupingBy(EntityLogEntry::getType, Collectors.counting()));

    Document document = readDocument(documentStructurePath, documentTextPath, documentPositionsPath, documentPagesPath);

    List<PrecursorEntity> textPrecursors = sourceEntries.stream()
            .filter(entry -> !isImageOrArea(entry))
            .map(entry -> precursorEntityWithState(entry))
            .toList();
    List<EntityLogEntry> imageOrAreaEntries = sourceEntries.stream()
            .filter(entry -> isImageOrArea(entry))
            .toList();

    List<PrecursorEntity> unmatchedPrecursors = new LinkedList<>();
    Map<String, List<TextEntity>> candidatesByValue = entityFindingUtility.findAllPossibleEntitiesAndGroupByValue(document, textPrecursors);

    for (PrecursorEntity precursor : textPrecursors) {
        Optional<TextEntity> closestCandidate = entityFindingUtility.findClosestEntityAndReturnEmptyIfNotFound(precursor, candidatesByValue, MATCH_THRESHOLD);
        if (closestCandidate.isPresent()) {
            createCorrectEntity(precursor, closestCandidate.get(), false);
        } else {
            unmatchedPrecursors.add(precursor);
        }
    }

    // The candidates were only needed for matching; take all of them out of the graph again.
    candidatesByValue.values()
            .forEach(candidates -> candidates.forEach(TextEntity::removeFromGraph));

    List<EntityLogEntry> createdEntries = entityLogCreatorService.createEntityLogEntries(document,
                                                                                         buildTestAnalyzeRequest(),
                                                                                         unmatchedPrecursors,
                                                                                         entityLog.getAnalysisNumber());

    Map<String, EntityLogEntry> migratedById = createdEntries.stream()
            .collect(Collectors.toMap(EntityLogEntry::getId, Function.identity()));

    addAreaOrImageEntitiesIfNotPresent(migratedById, imageOrAreaEntries);
    requireAllPreviousIdsPresent(entityLog, migratedById);

    for (EntityLogEntry source : sourceEntries) {
        if (source.getState().equals(EntryState.REMOVED)) {
            continue;
        }
        // Presence of the id is guaranteed by requireAllPreviousIdsPresent above.
        EntityLogEntry migrated = migratedById.get(source.getId());
        migrated.setManualChanges(source.getManualChanges());
        migrated.setChanges(source.getChanges());
        migrated.setReference(source.getReference());

        boolean isImage = source.getEntryType().equals(EntryType.IMAGE) || source.getEntryType().equals(EntryType.IMAGE_HINT);
        if (isImage) {
            migrated.setState(source.getState());
            migrated.setReason(source.getReason());
            migrated.setLegalBasis(source.getLegalBasis());
        }
    }

    logMigratedEntityLogDifference(entityLog, migratedById);

    entityLog.setEntityLogEntry(new LinkedList<>(migratedById.values()));

    Map<String, Long> migratedCounts = entityLog.getEntityLogEntry()
            .stream()
            .collect(Collectors.groupingBy(EntityLogEntry::getType, Collectors.counting()));

    for (String type : originalCounts.keySet()) {
        if (!migratedCounts.containsKey(type)) {
            log.error("Type {} missing entirely", type);
        } else {
            long originalCount = originalCounts.get(type);
            long migratedCount = migratedCounts.get(type);
            if (originalCount != migratedCount) {
                log.error("Type {} mismatch {} <-> {}", type, originalCount, migratedCount);
            }
        }
    }

    saveMigratedEntityLog(outputPath, entityLog);

    int notFoundCount = unmatchedPrecursors.size();
    int migratedTotal = migratedById.size();
    log.info("Finished entityLog migration, did not find {} of {}", notFoundCount, migratedTotal);
    totalEntities.addAndGet(migratedTotal);
    totalNotFoundEntities.addAndGet(notFoundCount);
}
/**
 * Logs an error when the migrated log differs in size from the original log and at least one
 * migrated entry that is not among the ids to migrate is neither an image nor an area.
 *
 * @param entityLog                      the original entity log
 * @param migratedEntityLogEntriesLookup migrated entries keyed by id
 */
private static void logMigratedEntityLogDifference(EntityLog entityLog, Map<String, EntityLogEntry> migratedEntityLogEntriesLookup) {

    int originalSize = entityLog.getEntityLogEntry().size();
    int migratedSize = migratedEntityLogEntriesLookup.values().size();
    if (originalSize == migratedSize) {
        return;
    }

    Set<String> idsToMigrate = buildIdsToMigrate(entityLog);
    boolean unexpectedTextEntry = false;
    for (String migratedId : migratedEntityLogEntriesLookup.keySet()) {
        if (idsToMigrate.contains(migratedId)) {
            continue;
        }
        if (!isImageOrArea(migratedEntityLogEntriesLookup.get(migratedId))) {
            unexpectedTextEntry = true;
            break;
        }
    }

    if (unexpectedTextEntry) {
        log.error("Entity count mismatch {} <-> {}", originalSize, migratedSize);
    }
}
/**
 * Collects the ids of all entries of the given entity log whose state is not {@code REMOVED}.
 *
 * @param entityLog the entity log to inspect
 * @return the ids that have to be present after the migration
 */
private static Set<String> buildIdsToMigrate(EntityLog entityLog) {

    List<EntityLogEntry> allEntries = entityLog.getEntityLogEntry();
    Set<String> idsToMigrate = allEntries.stream()
            .filter(entry -> !EntryState.REMOVED.equals(entry.getState()))
            .map(entry -> entry.getId())
            .collect(Collectors.toSet());
    return idsToMigrate;
}
/**
 * Adds every given image/area entry to the lookup unless the lookup already contains its id.
 *
 * @param entityLogEntries   lookup of migrated entries by id, modified in place
 * @param imageOrAreaEntries entries taken from the original entity log
 */
private static void addAreaOrImageEntitiesIfNotPresent(Map<String, EntityLogEntry> entityLogEntries, List<EntityLogEntry> imageOrAreaEntries) {

    imageOrAreaEntries.forEach(candidate -> {
        String candidateId = candidate.getId();
        if (!entityLogEntries.containsKey(candidateId)) {
            entityLogEntries.put(candidateId, candidate);
        }
    });
}
/**
 * Tells whether the entry's type is {@code IMAGE}, {@code IMAGE_HINT} or {@code AREA}.
 *
 * @param entry the entry to classify
 * @return {@code true} for image, image hint and area entries, {@code false} otherwise
 */
private static boolean isImageOrArea(EntityLogEntry entry) {

    var type = entry.getEntryType();
    if (type.equals(EntryType.IMAGE)) {
        return true;
    }
    if (type.equals(EntryType.IMAGE_HINT)) {
        return true;
    }
    return type.equals(EntryType.AREA);
}
/**
 * Fails when an id of a non-removed entry of the original log is absent from the migrated lookup.
 *
 * @param entityLog                      the original entity log
 * @param migratedEntityLogEntriesLookup migrated entries keyed by id
 * @throws AssertionError if at least one id to migrate is missing
 */
private static void requireAllPreviousIdsPresent(EntityLog entityLog, Map<String, EntityLogEntry> migratedEntityLogEntriesLookup) {

    Set<String> missingIds = Sets.difference(buildIdsToMigrate(entityLog), migratedEntityLogEntriesLookup.keySet());
    if (missingIds.isEmpty()) {
        return;
    }
    log.error("Missing ids: {} are missing ", missingIds);
    throw new AssertionError(String.format("Missing ids: %s are missing ", missingIds));
}
/**
 * Builds an {@link AnalyzeRequest} that only carries the test dossier template id, file id and
 * dossier id.
 *
 * @return the analyze request used when creating the migrated entity log entries
 */
private static AnalyzeRequest buildTestAnalyzeRequest() {

    var requestBuilder = AnalyzeRequest.builder()
            .dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID)
            .fileId(TEST_FILE_ID)
            .dossierId(TEST_DOSSIER_ID);
    return requestBuilder.build();
}
/**
 * Serializes the entity log as JSON to the given file, creating missing parent directories first.
 *
 * @param outputPath target file; an existing file is overwritten
 * @param entityLog  the entity log to write
 */
@SneakyThrows
private void saveMigratedEntityLog(Path outputPath, EntityLog entityLog) {

    // getParent() is null for a single-element path such as "out.json"; nothing to create then.
    Path parentFolder = outputPath.getParent();
    if (parentFolder != null) {
        Files.createDirectories(parentFolder);
    }
    try (var out = Files.newOutputStream(outputPath)) {
        mapper.writeValue(out, entityLog);
    }
}
/**
 * Converts an entity log entry into a {@link PrecursorEntity} and replays the entry's state on it:
 * applied, skipped and ignored entries are applied, skipped and ignored respectively, while removed
 * and pending entries are removed. Each call passes on the entry's matched rule and reason; applying
 * additionally passes the legal basis.
 *
 * @param entityLogEntry the entry to convert
 * @return the precursor entity carrying the entry's state
 */
private PrecursorEntity precursorEntityWithState(EntityLogEntry entityLogEntry) {

    PrecursorEntity converted = PrecursorEntity.fromEntityLogEntry(entityLogEntry);
    var matchedRule = entityLogEntry.getMatchedRule();
    var reason = entityLogEntry.getReason();

    switch (entityLogEntry.getState()) {
        case APPLIED -> converted.apply(matchedRule, reason, entityLogEntry.getLegalBasis());
        case SKIPPED -> converted.skip(matchedRule, reason);
        case IGNORED -> converted.ignore(matchedRule, reason);
        case REMOVED, PENDING -> converted.remove(matchedRule, reason);
    }
    return converted;
}
/**
 * Reads the four document data JSON files (structure, text, positions, pages), assembles them into
 * a {@link DocumentData} and maps that to a document graph.
 *
 * @param documentStructurePath document structure JSON
 * @param documentTextPath      document text JSON (array)
 * @param documentPositionsPath document position JSON (array)
 * @param documentPagesPath     document pages JSON (array)
 * @return the document graph built from the four files
 */
@SneakyThrows
private Document readDocument(Path documentStructurePath, Path documentTextPath, Path documentPositionsPath, Path documentPagesPath) {

    DocumentStructure structure;
    try (FileInputStream structureStream = new FileInputStream(documentStructurePath.toFile())) {
        structure = mapper.readValue(structureStream, DocumentStructure.class);
    }

    DocumentTextData[] textData;
    try (FileInputStream textStream = new FileInputStream(documentTextPath.toFile())) {
        textData = mapper.readValue(textStream, DocumentTextData[].class);
    }

    DocumentPositionData[] positionData;
    try (FileInputStream positionStream = new FileInputStream(documentPositionsPath.toFile())) {
        positionData = mapper.readValue(positionStream, DocumentPositionData[].class);
    }

    DocumentPage[] pages;
    try (FileInputStream pagesStream = new FileInputStream(documentPagesPath.toFile())) {
        pages = mapper.readValue(pagesStream, DocumentPage[].class);
    }

    var documentData = DocumentData.builder()
            .documentStructure(structure)
            .documentTextData(textData)
            .documentPositionData(positionData)
            .documentPages(pages)
            .build();
    return DocumentGraphMapper.toDocumentGraph(documentData);
}
/**
 * Reads an {@link EntityLog} from the given JSON file and strips duplicate entries from it.
 *
 * @param entityLogPath location of the entity log JSON
 * @return the parsed entity log after {@code removeDuplicates} has been applied
 */
@SneakyThrows
private EntityLog readEntityLog(Path entityLogPath) {

    try (FileInputStream entityLogStream = new FileInputStream(entityLogPath.toFile())) {
        EntityLog parsedLog = mapper.readValue(entityLogStream, EntityLog.class);
        return removeDuplicates(parsedLog);
    }
}
/**
 * Removes entries with duplicate ids from the entity log, in place.
 * <p>
 * The first entry seen for an id is kept. A later entry with the same id only replaces the kept one
 * when the kept entry is {@code REMOVED} and the later one is not; the replacement is appended at
 * the end of the list. Every duplicate is logged and counted in {@code totalRemovedDuplicates}.
 *
 * @param entityLog the entity log to clean; its entry list is replaced
 * @return the same entity log instance, now holding at most one entry per id
 */
protected EntityLog removeDuplicates(EntityLog entityLog) {

    int duplicateCount = 0;
    List<EntityLogEntry> originalEntries = entityLog.getEntityLogEntry();
    // Always maps an id to the entry currently kept for it in cleanedEntries.
    Map<String, EntityLogEntry> keptEntriesById = new HashMap<>();
    List<EntityLogEntry> cleanedEntries = new ArrayList<>(originalEntries.size());

    for (EntityLogEntry entry : originalEntries) {
        EntityLogEntry keptEntry = keptEntriesById.putIfAbsent(entry.getId(), entry);
        if (keptEntry == null) {
            cleanedEntries.add(entry);
            continue;
        }

        duplicateCount++;
        log.warn("Duplicate entry found for id {} \nExisting: {}\n Duplicate: {}", entry.getId(), keptEntry, entry);

        if (keptEntry.getState().equals(EntryState.REMOVED) && !entry.getState().equals(EntryState.REMOVED)) {
            // Drop exactly the kept instance (identity, not equals) and keep the live entry instead.
            cleanedEntries.removeIf(candidate -> candidate == keptEntry);
            cleanedEntries.add(entry);
            // Keep the lookup in sync; otherwise a further duplicate would be compared against the
            // dropped entry and appended a second time.
            keptEntriesById.put(entry.getId(), entry);
        }
    }

    entityLog.setEntityLogEntry(cleanedEntries);
    log.info("Removed {} duplicates", duplicateCount);
    totalRemovedDuplicates.getAndAdd(duplicateCount);
    return entityLog;
}
}