Merge branch 'entitylog-migration' into 'master'
entitylog-mapping: add test to map entity logs to existing DocumentData. See merge request redactmanager/redaction-service!503.
This commit is contained in:
commit
f3cdf46008
@ -63,8 +63,8 @@ public class Document extends AbstractSemanticNode {
|
||||
*
|
||||
* @return A list of main sections within the document
|
||||
* @deprecated This method is marked for removal.
|
||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||
*/
|
||||
@Deprecated(forRemoval = true)
|
||||
public List<Section> getMainSections() {
|
||||
|
||||
@ -148,7 +148,7 @@ public class EntityLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
private List<EntityLogEntry> createEntityLogEntries(Document document, AnalyzeRequest analyzeRequest, List<PrecursorEntity> notFoundPrecursorEntries, int analysisNumber) {
|
||||
public List<EntityLogEntry> createEntityLogEntries(Document document, AnalyzeRequest analyzeRequest, List<PrecursorEntity> notFoundPrecursorEntries, int analysisNumber) {
|
||||
|
||||
String dossierTemplateId = analyzeRequest.getDossierTemplateId();
|
||||
|
||||
|
||||
@ -17,6 +17,7 @@ import java.util.stream.Collectors;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import com.iqser.red.service.redaction.v1.server.model.ClosestEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
|
||||
@ -26,7 +27,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionO
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -36,14 +36,12 @@ import lombok.extern.slf4j.Slf4j;
|
||||
public class EntityFindingUtility {
|
||||
|
||||
EntityCreationService entityCreationService;
|
||||
DictionaryService dictionaryService;
|
||||
|
||||
|
||||
@Autowired
|
||||
public EntityFindingUtility(EntityEnrichmentService entityEnrichmentService, DictionaryService dictionaryService) {
|
||||
public EntityFindingUtility(EntityEnrichmentService entityEnrichmentService) {
|
||||
|
||||
entityCreationService = new EntityCreationService(entityEnrichmentService);
|
||||
this.dictionaryService = dictionaryService;
|
||||
}
|
||||
|
||||
|
||||
@ -172,12 +170,13 @@ public class EntityFindingUtility {
|
||||
|
||||
if (!pageNumbers.stream()
|
||||
.allMatch(node::onPage)) {
|
||||
throw new IllegalArgumentException(format("SemanticNode \"%s\" does not contain these pages %s, it has pages: %s",
|
||||
throw new IllegalArgumentException(format("SemanticNode \"%s\" is missing pages %s",
|
||||
node,
|
||||
pageNumbers.stream()
|
||||
.filter(pageNumber -> !node.onPage(pageNumber))
|
||||
.toList(),
|
||||
node.getPages()));
|
||||
Sets.difference(pageNumbers,
|
||||
node.getPages()
|
||||
.stream()
|
||||
.map(Page::getNumber)
|
||||
.collect(Collectors.toSet()))));
|
||||
}
|
||||
|
||||
SearchImplementation searchImplementation = new SearchImplementation(entryValues.stream()
|
||||
|
||||
@ -0,0 +1,425 @@
|
||||
package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.service.document.EntityFromPrecursorCreationService.createCorrectEntity;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.FilterType;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.FileStatus;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLog;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogEntry;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryState;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentData;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.EntityLogCreatorService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.service.document.EntityFindingUtility;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Disabled
|
||||
@Slf4j
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(RedactionIntegrationTest.RedactionIntegrationTestConfiguration.class)
|
||||
public class MapEntityLogToDocumentDataTest extends AbstractRedactionIntegrationTest {
|
||||
|
||||
public static final int MATCH_THRESHOLD = 25;
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
Path outputFolder = Path.of("/tmp/MIGRATED_ENTITY_LOGS");
|
||||
|
||||
@Autowired
|
||||
RedactionServiceSettings redactionServiceSettings;
|
||||
@Autowired
|
||||
EntityFindingUtility entityFindingUtility;
|
||||
@Autowired
|
||||
EntityLogCreatorService entityLogCreatorService;
|
||||
@Autowired
|
||||
private DictionaryService dictionaryService;
|
||||
|
||||
private AtomicLong totalEntities;
|
||||
private AtomicLong totalNotFoundEntities;
|
||||
private AtomicLong totalRemovedDuplicates;
|
||||
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
@Import({LayoutParsingServiceProcessorConfiguration.class})
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
|
||||
return new FileSystemBackedStorageService(ObjectMapperFactory.create());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void stubClients() {
|
||||
|
||||
TenantContext.setTenantId("redaction");
|
||||
|
||||
when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
|
||||
when(rulesClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.COMPONENT)).thenReturn(-1L);
|
||||
|
||||
loadDictionaryForTest();
|
||||
loadTypeForTest();
|
||||
loadNerForTest();
|
||||
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(0L);
|
||||
when(dictionaryClient.getAllTypesForDossierTemplate(TEST_DOSSIER_TEMPLATE_ID, null, true)).thenReturn(getTemplateDictionaryTypeResponse());
|
||||
|
||||
when(dictionaryClient.getVersionForDossier(TEST_DOSSIER_ID)).thenReturn(0L);
|
||||
when(dictionaryClient.getAllTypesForDossier(TEST_DOSSIER_ID, null, true)).thenReturn(getDossierDictionaryTypeResponse());
|
||||
|
||||
mockDictionaryCalls(null);
|
||||
|
||||
when(dictionaryClient.getColors(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(colors);
|
||||
dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
totalNotFoundEntities = new AtomicLong();
|
||||
totalEntities = new AtomicLong();
|
||||
totalRemovedDuplicates = new AtomicLong();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void migrateEntityLog() {
|
||||
|
||||
Path entityLogPath = Path.of("/home/kschuettler/Dokumente/TestFiles/NER Dataset/1c4f5fa9a36d67d285f051f3a22813c4.ENTITY_LOG.json");
|
||||
Path documentStructurePath = Path.of(
|
||||
"/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_STRUCTURE.json");
|
||||
Path documentTextPath = Path.of(
|
||||
"/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_TEXT.json");
|
||||
Path documentPositionsPath = Path.of(
|
||||
"/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_POSITION.json");
|
||||
Path documentPagesPath = Path.of(
|
||||
"/home/kschuettler/Downloads/New Folder (8)/8b9fcbce-2d25-4b68-b6ef-d5b8db3fc72e/0b6d3f3dcf4ebce9a9b080a1fed7fa12/0b6d3f3dcf4ebce9a9b080a1fed7fa12.DOCUMENT_PAGES.json");
|
||||
Path outputPath = Path.of("/tmp/migratedEntityLog.json");
|
||||
|
||||
runForFile(entityLogPath, documentStructurePath, documentTextPath, documentPositionsPath, documentPagesPath, outputPath);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void migrateAllEntityLogs() {
|
||||
|
||||
Path entityLogFolder = Path.of("/home/kschuettler/Dokumente/TestFiles/NER Dataset/");
|
||||
Path fileExchangeFolder = Path.of("/home/kschuettler/Downloads/New Folder (8)/");
|
||||
Files.walk(fileExchangeFolder)
|
||||
.filter(file -> file.getFileName().toString().endsWith("FILE_STATUS.json"))
|
||||
.forEach(path -> {
|
||||
FileStatus fileStatus = readFileStatus(path);
|
||||
String originalFileName = fileStatus.getFilename().split("\\.")[0];
|
||||
Path fileFolder = path.getParent();
|
||||
runForFile(entityLogFolder.resolve(originalFileName + ".ENTITY_LOG.json"),
|
||||
fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_STRUCTURE.json"),
|
||||
fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_TEXT.json"),
|
||||
fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_POSITION.json"),
|
||||
fileExchangeFolder.resolve(fileFolder).resolve(fileStatus.getId() + ".DOCUMENT_PAGES.json"),
|
||||
outputFolder.resolve(fileStatus.getId() + ".ENTITY_LOG.json"));
|
||||
});
|
||||
|
||||
log.info("All EntityLogs migrated and written to {}, did not find {} of {}, removed {} duplicate",
|
||||
outputFolder,
|
||||
totalNotFoundEntities.get(),
|
||||
totalEntities.get(),
|
||||
totalRemovedDuplicates.get());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private FileStatus readFileStatus(Path path) {
|
||||
|
||||
try (var in = new FileInputStream(path.toFile())) {
|
||||
return mapper.readValue(in, FileStatus.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void runForFile(Path entityLogPath, Path documentStructurePath, Path documentTextPath, Path documentPositionsPath, Path documentPagesPath, Path outputPath) {
|
||||
|
||||
log.info("Starting entity log migration for file {} and entityLog {}", documentStructurePath, entityLogPath);
|
||||
|
||||
EntityLog entityLog = readEntityLog(entityLogPath);
|
||||
|
||||
Map<String, Long> originalCounts = entityLog.getEntityLogEntry()
|
||||
.stream()
|
||||
.filter(e -> !e.getState().equals(EntryState.REMOVED))
|
||||
.collect(Collectors.groupingBy(EntityLogEntry::getType, Collectors.counting()));
|
||||
|
||||
Document document = readDocument(documentStructurePath, documentTextPath, documentPositionsPath, documentPagesPath);
|
||||
List<PrecursorEntity> textPrecursorEntities = entityLog.getEntityLogEntry()
|
||||
.stream()
|
||||
.filter(entry -> !isImageOrArea(entry))
|
||||
.map(this::precursorEntityWithState)
|
||||
.toList();
|
||||
|
||||
List<EntityLogEntry> imageOrAreaEntries = entityLog.getEntityLogEntry()
|
||||
.stream()
|
||||
.filter(MapEntityLogToDocumentDataTest::isImageOrArea)
|
||||
.toList();
|
||||
|
||||
List<PrecursorEntity> notFoundEntities = new LinkedList<>();
|
||||
Map<String, List<TextEntity>> tempEntitiesByValue = entityFindingUtility.findAllPossibleEntitiesAndGroupByValue(document, textPrecursorEntities);
|
||||
for (PrecursorEntity precursorEntity : textPrecursorEntities) {
|
||||
Optional<TextEntity> optionalClosestEntity = entityFindingUtility.findClosestEntityAndReturnEmptyIfNotFound(precursorEntity, tempEntitiesByValue, MATCH_THRESHOLD);
|
||||
if (optionalClosestEntity.isEmpty()) {
|
||||
notFoundEntities.add(precursorEntity);
|
||||
continue;
|
||||
}
|
||||
createCorrectEntity(precursorEntity, optionalClosestEntity.get(), false);
|
||||
}
|
||||
|
||||
tempEntitiesByValue.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.forEach(TextEntity::removeFromGraph);
|
||||
|
||||
List<EntityLogEntry> entityLogEntries = entityLogCreatorService.createEntityLogEntries(document,
|
||||
buildTestAnalyzeRequest(),
|
||||
notFoundEntities,
|
||||
entityLog.getAnalysisNumber());
|
||||
|
||||
Map<String, EntityLogEntry> migratedEntityLogEntriesLookup = entityLogEntries.stream()
|
||||
.collect(Collectors.toMap(EntityLogEntry::getId, Function.identity()));
|
||||
|
||||
addAreaOrImageEntitiesIfNotPresent(migratedEntityLogEntriesLookup, imageOrAreaEntries);
|
||||
|
||||
requireAllPreviousIdsPresent(entityLog, migratedEntityLogEntriesLookup);
|
||||
|
||||
for (EntityLogEntry entityLogEntry : entityLog.getEntityLogEntry()) {
|
||||
if (entityLogEntry.getState().equals(EntryState.REMOVED)) {
|
||||
continue;
|
||||
}
|
||||
migratedEntityLogEntriesLookup.get(entityLogEntry.getId()).setManualChanges(entityLogEntry.getManualChanges());
|
||||
migratedEntityLogEntriesLookup.get(entityLogEntry.getId()).setChanges(entityLogEntry.getChanges());
|
||||
migratedEntityLogEntriesLookup.get(entityLogEntry.getId()).setReference(entityLogEntry.getReference());
|
||||
if (entityLogEntry.getEntryType().equals(EntryType.IMAGE) || entityLogEntry.getEntryType().equals(EntryType.IMAGE_HINT)) {
|
||||
migratedEntityLogEntriesLookup.get(entityLogEntry.getId()).setState(entityLogEntry.getState());
|
||||
migratedEntityLogEntriesLookup.get(entityLogEntry.getId()).setReason(entityLogEntry.getReason());
|
||||
migratedEntityLogEntriesLookup.get(entityLogEntry.getId()).setLegalBasis(entityLogEntry.getLegalBasis());
|
||||
}
|
||||
}
|
||||
|
||||
logMigratedEntityLogDifference(entityLog, migratedEntityLogEntriesLookup);
|
||||
|
||||
entityLog.setEntityLogEntry(new LinkedList<>(migratedEntityLogEntriesLookup.values()));
|
||||
|
||||
Map<String, Long> migratedCounts = entityLog.getEntityLogEntry()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(EntityLogEntry::getType, Collectors.counting()));
|
||||
|
||||
for (String type : originalCounts.keySet()) {
|
||||
if (!migratedCounts.containsKey(type)) {
|
||||
log.error("Type {} missing entirely", type);
|
||||
continue;
|
||||
}
|
||||
long originalCount = originalCounts.get(type);
|
||||
long migratedCount = migratedCounts.get(type);
|
||||
if (originalCount != migratedCount) {
|
||||
log.error("Type {} mismatch {} <-> {}", type, originalCount, migratedCount);
|
||||
}
|
||||
|
||||
}
|
||||
saveMigratedEntityLog(outputPath, entityLog);
|
||||
log.info("Finished entityLog migration, did not find {} of {}", notFoundEntities.size(), migratedEntityLogEntriesLookup.size());
|
||||
totalEntities.getAndAdd(migratedEntityLogEntriesLookup.size());
|
||||
totalNotFoundEntities.getAndAdd(notFoundEntities.size());
|
||||
}
|
||||
|
||||
|
||||
private static void logMigratedEntityLogDifference(EntityLog entityLog, Map<String, EntityLogEntry> migratedEntityLogEntriesLookup) {
|
||||
|
||||
if (entityLog.getEntityLogEntry().size() != migratedEntityLogEntriesLookup.values().size()) {
|
||||
if (Sets.difference(migratedEntityLogEntriesLookup.keySet(), buildIdsToMigrate(entityLog))
|
||||
.stream()
|
||||
.map(migratedEntityLogEntriesLookup::get)
|
||||
.anyMatch(entry -> !isImageOrArea(entry))) {
|
||||
|
||||
log.error("Entity count mismatch {} <-> {}", entityLog.getEntityLogEntry().size(), migratedEntityLogEntriesLookup.values().size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Set<String> buildIdsToMigrate(EntityLog entityLog) {
|
||||
|
||||
return entityLog.getEntityLogEntry()
|
||||
.stream()
|
||||
.filter(e -> e.getState() != EntryState.REMOVED)
|
||||
.map(EntityLogEntry::getId)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
|
||||
private static void addAreaOrImageEntitiesIfNotPresent(Map<String, EntityLogEntry> entityLogEntries, List<EntityLogEntry> imageOrAreaEntries) {
|
||||
|
||||
for (EntityLogEntry imageOrAreaEntry : imageOrAreaEntries) {
|
||||
|
||||
if (entityLogEntries.containsKey(imageOrAreaEntry.getId())) {
|
||||
continue;
|
||||
}
|
||||
entityLogEntries.put(imageOrAreaEntry.getId(), imageOrAreaEntry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static boolean isImageOrArea(EntityLogEntry entry) {
|
||||
|
||||
return entry.getEntryType().equals(EntryType.IMAGE) || entry.getEntryType().equals(EntryType.IMAGE_HINT) || entry.getEntryType().equals(EntryType.AREA);
|
||||
}
|
||||
|
||||
|
||||
private static void requireAllPreviousIdsPresent(EntityLog entityLog, Map<String, EntityLogEntry> migratedEntityLogEntriesLookup) {
|
||||
|
||||
Set<String> existingEntryIds = buildIdsToMigrate(entityLog);
|
||||
|
||||
Set<String> idDiff = Sets.difference(existingEntryIds, migratedEntityLogEntriesLookup.keySet());
|
||||
if (!idDiff.isEmpty()) {
|
||||
log.error("Missing ids: {} are missing ", idDiff);
|
||||
throw new AssertionError(String.format("Missing ids: %s are missing ", idDiff));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static AnalyzeRequest buildTestAnalyzeRequest() {
|
||||
|
||||
return AnalyzeRequest.builder().dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID).fileId(TEST_FILE_ID).dossierId(TEST_DOSSIER_ID).build();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void saveMigratedEntityLog(Path outputPath, EntityLog entityLog) {
|
||||
|
||||
Files.createDirectories(outputPath.getParent());
|
||||
try (var out = new FileOutputStream(outputPath.toFile())) {
|
||||
mapper.writeValue(out, entityLog);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private PrecursorEntity precursorEntityWithState(EntityLogEntry entityLogEntry) {
|
||||
|
||||
PrecursorEntity precursorEntity = PrecursorEntity.fromEntityLogEntry(entityLogEntry);
|
||||
switch (entityLogEntry.getState()) {
|
||||
case APPLIED -> precursorEntity.apply(entityLogEntry.getMatchedRule(), entityLogEntry.getReason(), entityLogEntry.getLegalBasis());
|
||||
case SKIPPED -> precursorEntity.skip(entityLogEntry.getMatchedRule(), entityLogEntry.getReason());
|
||||
case IGNORED -> precursorEntity.ignore(entityLogEntry.getMatchedRule(), entityLogEntry.getReason());
|
||||
case REMOVED, PENDING -> precursorEntity.remove(entityLogEntry.getMatchedRule(), entityLogEntry.getReason());
|
||||
}
|
||||
return precursorEntity;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Document readDocument(Path documentStructurePath, Path documentTextPath, Path documentPositionsPath, Path documentPagesPath) {
|
||||
|
||||
var documentDataBuilder = DocumentData.builder();
|
||||
try (var in = new FileInputStream(documentStructurePath.toFile())) {
|
||||
documentDataBuilder.documentStructure(mapper.readValue(in, DocumentStructure.class));
|
||||
}
|
||||
try (var in = new FileInputStream(documentTextPath.toFile())) {
|
||||
documentDataBuilder.documentTextData(mapper.readValue(in, DocumentTextData[].class));
|
||||
}
|
||||
try (var in = new FileInputStream(documentPositionsPath.toFile())) {
|
||||
documentDataBuilder.documentPositionData(mapper.readValue(in, DocumentPositionData[].class));
|
||||
}
|
||||
try (var in = new FileInputStream(documentPagesPath.toFile())) {
|
||||
documentDataBuilder.documentPages(mapper.readValue(in, DocumentPage[].class));
|
||||
}
|
||||
|
||||
return DocumentGraphMapper.toDocumentGraph(documentDataBuilder.build());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private EntityLog readEntityLog(Path entityLogPath) {
|
||||
|
||||
try (var in = new FileInputStream(entityLogPath.toFile())) {
|
||||
return removeDuplicates(mapper.readValue(in, EntityLog.class));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected EntityLog removeDuplicates(EntityLog entityLog) {
|
||||
|
||||
int duplicateCount = 0;
|
||||
|
||||
HashMap<String, EntityLogEntry> existingAnnotationIds = new HashMap<>();
|
||||
|
||||
int size = entityLog.getEntityLogEntry().size();
|
||||
List<EntityLogEntry> cleanedEntries = new ArrayList<>(size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
EntityLogEntry entry = entityLog.getEntityLogEntry()
|
||||
.get(i);
|
||||
if (!existingAnnotationIds.containsKey(entry.getId())) {
|
||||
cleanedEntries.add(entry);
|
||||
existingAnnotationIds.put(entry.getId(), entry);
|
||||
} else {
|
||||
duplicateCount++;
|
||||
log.warn("Duplicate entry found for id {} \nExisting: {}\n Duplicate: {}", entry.getId(), existingAnnotationIds.get(entry.getId()), entry);
|
||||
if (existingAnnotationIds.get(entry.getId()).getState().equals(EntryState.REMOVED) && !entry.getState().equals(EntryState.REMOVED)) {
|
||||
cleanedEntries.remove(existingAnnotationIds.get(entry.getId()));
|
||||
cleanedEntries.add(entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
entityLog.setEntityLogEntry(cleanedEntries);
|
||||
log.info("Removed {} duplicates", duplicateCount);
|
||||
totalRemovedDuplicates.getAndAdd(duplicateCount);
|
||||
return entityLog;
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user