Merge branch 'RED-7834' into 'master'

RED-7834: fixes for migration

Closes RED-7834

See merge request redactmanager/redaction-service!225
This commit is contained in:
Timo Bejan 2023-12-11 15:44:41 +01:00
commit dc03894ea4
7 changed files with 113 additions and 34 deletions

View File

@ -16,6 +16,7 @@ import com.iqser.red.service.redaction.v1.model.MigrationRequest;
import com.iqser.red.service.redaction.v1.model.MigrationResponse;
import com.iqser.red.service.redaction.v1.server.model.MigratedEntityLog;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
@ -37,6 +38,7 @@ public class MigrationMessageReceiver {
LegacyRedactionLogMergeService legacyRedactionLogMergeService;
LegacyVersion0MigrationService legacyVersion0MigrationService;
RabbitTemplate rabbitTemplate;
DictionaryService dictionaryService;
@SneakyThrows
@ -45,7 +47,8 @@ public class MigrationMessageReceiver {
public void receiveMigrationRequest(Message message) {
MigrationRequest migrationRequest = objectMapper.readValue(message.getBody(), MigrationRequest.class);
log.info("--------------------------------------------------------------------");
log.info("Starting redactionLog to entityLog migration for dossierId {} and fileId {}", migrationRequest.getDossierId(), migrationRequest.getFileId());
// TODO: if an image is not found, try to copy the old one exactly (like with TextEntities)
Document document = DocumentGraphMapper.toDocumentGraph(redactionStorageService.getDocumentData(migrationRequest.getDossierId(), migrationRequest.getFileId()));
@ -57,12 +60,18 @@ public class MigrationMessageReceiver {
redactionLog = legacyRedactionLogMergeService.mergeManualChanges(redactionLog, migrationRequest.getManualRedactions(), migrationRequest.getDossierTemplateId());
}
dictionaryService.updateDictionary(migrationRequest.getDossierTemplateId(), migrationRequest.getDossierId());
MigratedEntityLog migratedEntityLog = redactionLogToEntityLogMigrationService.migrate(redactionLog, document);
redactionStorageService.storeObject(migrationRequest.getDossierId(), migrationRequest.getFileId(), FileType.ENTITY_LOG, migratedEntityLog.getEntityLog());
redactionStorageService.storeObject(migrationRequest.getDossierId(), migrationRequest.getFileId(), FileType.MIGRATED_IDS, migratedEntityLog.getMigratedIds());
sendFinished(MigrationResponse.builder().dossierId(migrationRequest.getDossierId()).fileId(migrationRequest.getFileId()).build());
log.info("Migrated {} redactionLog entries for dossierId {} and fileId {}",
migratedEntityLog.getEntityLog().getEntityLogEntry().size(),
migrationRequest.getDossierId(),
migrationRequest.getFileId());
log.info("");
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.migration;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
@ -15,6 +16,8 @@ import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLog;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogLegalBasis;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.migration.MigratedIds;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogLegalBasis;
@ -26,10 +29,12 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.service.document.EntityCreationService;
import com.iqser.red.service.redaction.v1.server.service.document.EntityEnrichmentService;
import com.iqser.red.service.redaction.v1.server.service.document.EntityFindingUtility;
import com.iqser.red.service.redaction.v1.server.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.utils.MigratedIdsCollector;
import lombok.AccessLevel;
@ -66,7 +71,7 @@ public class RedactionLogToEntityLogMigrationService {
Map<String, String> oldToNewIDMapping = migratedIds.buildOldToNewMapping();
entityLog.setEntityLogEntry(entitiesToMigrate.stream().map(migrationEntity -> migrationEntity.toEntityLogEntry(oldToNewIDMapping)).toList());
if (redactionLog.getRedactionLogEntry().size() != entityLog.getEntityLogEntry().size()) {
if (getNumberOfApprovedEntries(redactionLog) != entityLog.getEntityLogEntry().size()) {
String message = String.format("Not all entities have been found during the migration redactionLog has %d entries and new entityLog %d",
redactionLog.getRedactionLogEntry().size(),
entityLog.getEntityLogEntry().size());
@ -78,6 +83,17 @@ public class RedactionLogToEntityLogMigrationService {
}
/**
 * Counts the redaction log entries whose manual changes are all in status APPROVED.
 * An entry without any manual changes is counted too, because the check is vacuously true for it.
 *
 * @param redactionLog the redaction log whose entries are inspected
 * @return the number of entries that carry no manual change with a status other than APPROVED
 */
private static long getNumberOfApprovedEntries(RedactionLog redactionLog) {

    long approvedEntries = 0;
    for (RedactionLogEntry entry : redactionLog.getRedactionLogEntry()) {
        // "no change that is not approved" is the same condition as "all changes are approved"
        boolean onlyApprovedChanges = entry.getManualChanges()
                .stream()
                .noneMatch(manualChange -> !manualChange.getAnnotationStatus().equals(AnnotationStatus.APPROVED));
        if (onlyApprovedChanges) {
            approvedEntries++;
        }
    }
    return approvedEntries;
}
private List<MigrationEntity> calculateMigrationEntitiesFromRedactionLog(RedactionLog redactionLog, Document document) {
List<MigrationEntity> images = getImageBasedMigrationEntities(redactionLog, document);
@ -95,23 +111,32 @@ public class RedactionLogToEntityLogMigrationService {
private List<MigrationEntity> getImageBasedMigrationEntities(RedactionLog redactionLog, Document document) {
List<Image> images = document.streamAllImages().collect(Collectors.toList());
List<RedactionLogEntry> redactionLogImages = redactionLog.getRedactionLogEntry().stream().filter(RedactionLogEntry::isImage).toList();
List<RedactionLogEntry> redactionLogImages = redactionLog.getRedactionLogEntry()
.stream()
.filter(RedactionLogEntry::isImage)
.filter(redactionLogEntry -> redactionLogEntry.getManualChanges()
.stream()
.allMatch(manualChange -> manualChange.getAnnotationStatus().equals(AnnotationStatus.APPROVED)))
.toList();
List<MigrationEntity> migrationEntities = new LinkedList<>();
for (RedactionLogEntry redactionLogImage : redactionLogImages) {
List<RectangleWithPage> imagePositions = redactionLogImage.getPositions().stream().map(RectangleWithPage::fromRedactionLogRectangle).toList();
assert imagePositions.size() == 1;
Image closestImage = images.stream()
Optional<Image> optionalClosestImage = images.stream()
.filter(image -> image.onPage(redactionLogImage.getPositions().get(0).getPage()))
.min(Comparator.comparingDouble(image -> entityFindingUtility.calculateDistance(image.getPosition(), imagePositions.get(0).rectangle2D())))
.orElseThrow(() -> new RuntimeException("Image from redaction log not found: " + redactionLogImage));
.filter(image -> entityFindingUtility.calculateDistance(image.getPosition(), imagePositions.get(0).rectangle2D()) <= MATCH_THRESHOLD);
double minDistance = entityFindingUtility.calculateDistance(closestImage.getPosition(), imagePositions.get(0).rectangle2D());
if (minDistance > MATCH_THRESHOLD) {
throw new RuntimeException(String.format("Closest image has a distance of %.2f which is higher than the allowed %.2f", minDistance, MATCH_THRESHOLD));
Image closestImage;
if (optionalClosestImage.isEmpty()) { // if no fitting image can be found create a new one with the previous values!
closestImage = buildImageDirectly(document, redactionLogImage);
} else {
closestImage = optionalClosestImage.get();
images.remove(closestImage);
}
images.remove(closestImage);
String ruleIdentifier = "OLDIMG." + redactionLogImage.getMatchedRule() + ".0";
if (redactionLogImage.lastChangeIsRemoved()) {
closestImage.remove(ruleIdentifier, redactionLogImage.getReason());
@ -126,11 +151,36 @@ public class RedactionLogToEntityLogMigrationService {
}
/**
 * Builds an {@link Image} node straight from a redaction log entry, for the case that no matching
 * image exists in the document. Type, transparency, page and position are taken from the entry;
 * only the first position of the entry is used.
 * <p>
 * The new image is handed to the document tree via {@code createNewMainEntryAndReturnId}, and the
 * returned tree id as well as an id computed by {@link IdBuilder} are set on it.
 *
 * @param document          the document that provides the document tree and the pages
 * @param redactionLogImage the image entry of the redaction log to rebuild
 * @return the newly built image
 * @throws java.util.NoSuchElementException if the document has no page with the page number of the entry
 */
private static Image buildImageDirectly(Document document, RedactionLogEntry redactionLogImage) {

    // Read the first position once instead of re-evaluating getPositions().get(0) for page and geometry.
    Rectangle firstPosition = redactionLogImage.getPositions().get(0);
    // Holding the page number as a primitive keeps the comparison below a numeric one.
    int pageNumber = firstPosition.getPage();

    Image image = Image.builder()
            .documentTree(document.getDocumentTree())
            .imageType(ImageType.fromString(redactionLogImage.getType()))
            .transparent(redactionLogImage.isImageHasTransparency())
            .page(document.getPages()
                    .stream()
                    .filter(p -> p.getNumber() == pageNumber)
                    .findFirst()
                    // Same exception type as the bare orElseThrow(), but it names the missing page.
                    .orElseThrow(() -> new java.util.NoSuchElementException(String.format("Page %d of redaction log image not found in document", pageNumber))))
            .position(toRectangle2D(firstPosition))
            .build();

    List<Integer> treeId = document.getDocumentTree().createNewMainEntryAndReturnId(image);
    image.setTreeId(treeId);
    image.setId(IdBuilder.buildId(image.getPages(), image.getBBox().values().stream().toList(), "", ""));
    return image;
}
/**
 * Converts a redaction log rectangle into an AWT {@link Rectangle2D}.
 * The top-left point of the input becomes the x/y of the result; width and height are copied as they are.
 *
 * @param rect the redaction log rectangle to convert
 * @return a {@link Rectangle2D.Double} with the same origin and dimensions
 */
private static Rectangle2D toRectangle2D(Rectangle rect) {

    double x = rect.getTopLeft().getX();
    double y = rect.getTopLeft().getY();
    double width = rect.getWidth();
    double height = rect.getHeight();
    return new Rectangle2D.Double(x, y, width, height);
}
private List<MigrationEntity> getTextBasedMigrationEntities(RedactionLog redactionLog, Document document) {
List<MigrationEntity> entitiesToMigrate = redactionLog.getRedactionLogEntry()
.stream()
.filter(redactionLogEntry -> !redactionLogEntry.isImage())
.filter(redactionLogEntry -> redactionLogEntry.getManualChanges()
.stream()
.allMatch(manualChange -> manualChange.getAnnotationStatus().equals(AnnotationStatus.APPROVED)))
.map(MigrationEntity::fromRedactionLogEntry)
.peek(migrationEntity -> {
if (migrationEntity.getRedactionLogEntry().lastChangeIsRemoved()) {

View File

@ -50,7 +50,7 @@ public final class MigrationEntity {
public static ManualEntity createManualEntity(RedactionLogEntry redactionLogEntry) {
String ruleIdentifier = "OLD." + redactionLogEntry.getMatchedRule() + ".0";
String ruleIdentifier = buildRuleIdentifier(redactionLogEntry);
List<RectangleWithPage> rectangleWithPages = redactionLogEntry.getPositions().stream().map(RectangleWithPage::fromRedactionLogRectangle).toList();
EntityType entityType = getEntityType(redactionLogEntry);
return ManualEntity.builder()
@ -72,6 +72,18 @@ public final class MigrationEntity {
}
/**
 * Derives the rule identifier for a redaction log entry.
 * Entries with a matched rule get {@code "OLD.<matchedRule>.0"}; entries without one get the
 * fixed identifier {@code "MAN.5.0"}.
 *
 * @param redactionLogEntry the redaction log entry to build the identifier for
 * @return the rule identifier string
 */
private static String buildRuleIdentifier(RedactionLogEntry redactionLogEntry) {

    if (redactionLogEntry.getMatchedRule() == null) {
        // pure ManualRedactions used to have no matched rule
        return "MAN.5.0";
    }
    return "OLD." + redactionLogEntry.getMatchedRule() + ".0";
}
private static EntityType getEntityType(RedactionLogEntry redactionLogEntry) {
if (redactionLogEntry.isRecommendation()) {
@ -212,7 +224,9 @@ public final class MigrationEntity {
.positions(List.of(new Position(image.getPosition(), image.getPage().getNumber())))
.containingNodeId(image.getTreeId())
.closestHeadline(image.getHeadline().getTextBlock().getSearchText())
.section(image.getManualOverwrite().getSection().orElse(image.getParent().toString()))
.section(redactionLogEntry.getSection())
.textAfter(redactionLogEntry.getTextAfter())
.textBefore(redactionLogEntry.getTextBefore())
.imageHasTransparency(image.isTransparent())
.state(buildEntryState(image))
.entryType(redactionLogEntry.isHint() ? EntryType.IMAGE_HINT : EntryType.IMAGE)
@ -232,14 +246,14 @@ public final class MigrationEntity {
.type(type)
.state(buildEntryState(manualEntity))
.entryType(buildEntryType(manualEntity))
.section(manualEntity.getManualOverwrite().getSection().orElse(manualEntity.getSection()))
.section(redactionLogEntry.getSection())
.textAfter(redactionLogEntry.getTextAfter())
.textBefore(redactionLogEntry.getTextBefore())
.containingNodeId(Collections.emptyList())
.closestHeadline("")
.matchedRule(manualEntity.getMatchedRule().getRuleIdentifier().toString())
.dictionaryEntry(manualEntity.isDictionaryEntry())
.dossierDictionaryEntry(manualEntity.isDossierDictionaryEntry())
.textAfter("")
.textBefore("")
.startOffset(-1)
.endOffset(-1)
.positions(manualEntity.getManualOverwrite()
@ -268,13 +282,13 @@ public final class MigrationEntity {
.legalBasis(entity.legalBasis())
.value(entity.getManualOverwrite().getValue().orElse(entity.getMatchedRule().isWriteValueWithLineBreaks() ? entity.getValueWithLineBreaks() : entity.getValue()))
.type(entity.getType())
.section(entity.getManualOverwrite().getSection().orElse(entity.getDeepestFullyContainingNode().toString()))
.section(redactionLogEntry.getSection())
.textAfter(redactionLogEntry.getTextAfter())
.textBefore(redactionLogEntry.getTextBefore())
.containingNodeId(entity.getDeepestFullyContainingNode().getTreeId())
.closestHeadline(entity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText())
.matchedRule(entity.getMatchedRule().getRuleIdentifier().toString())
.dictionaryEntry(entity.isDictionaryEntry())
.textAfter(entity.getTextAfter())
.textBefore(entity.getTextBefore())
.startOffset(entity.getTextRange().start())
.endOffset(entity.getTextRange().end())
.dossierDictionaryEntry(entity.isDossierDictionaryEntry())

View File

@ -9,6 +9,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -44,31 +45,35 @@ public class EntityFindingUtility {
}
public Optional<TextEntity> findClosestEntityAndReturnEmptyIfNotFound(ManualEntity identifier, Map<String, List<TextEntity>> entitiesWithSameValue, double matchThreshold) {
public Optional<TextEntity> findClosestEntityAndReturnEmptyIfNotFound(ManualEntity manualEntity, Map<String, List<TextEntity>> entitiesWithSameValue, double matchThreshold) {
List<TextEntity> possibleEntities = entitiesWithSameValue.get(identifier.getValue().toLowerCase(Locale.ENGLISH));
if (manualEntity.getValue() == null) {
return Optional.empty();
}
List<TextEntity> possibleEntities = entitiesWithSameValue.get(manualEntity.getValue().toLowerCase(Locale.ENGLISH));
if (entityIdentifierValueNotFound(possibleEntities)) {
log.warn("Entity could not be created with identifier: {}, due to the value {} not being found anywhere.", identifier, identifier.getValue());
log.warn("Entity could not be created with manualEntity: {}, due to the value {} not being found anywhere.", manualEntity, manualEntity.getValue());
return Optional.empty();
}
Optional<TextEntity> optionalClosestEntity = possibleEntities.stream()
.filter(entity -> pagesMatch(entity, identifier.getEntityPosition()))
.min(Comparator.comparingDouble(entity -> calculateMinDistance(identifier.getEntityPosition(), entity)));
.filter(entity -> pagesMatch(entity, manualEntity.getEntityPosition()))
.min(Comparator.comparingDouble(entity -> calculateMinDistance(manualEntity.getEntityPosition(), entity)));
if (optionalClosestEntity.isEmpty()) {
log.warn("No Entity with value {} found on page {}", identifier.getValue(), identifier.getEntityPosition());
log.warn("No Entity with value {} found on page {}", manualEntity.getValue(), manualEntity.getEntityPosition());
return Optional.empty();
}
TextEntity closestEntity = optionalClosestEntity.get();
double distance = calculateMinDistance(identifier.getEntityPosition(), closestEntity);
double distance = calculateMinDistance(manualEntity.getEntityPosition(), closestEntity);
if (distance > matchThreshold) {
log.warn("For entity {} on page {} with positions {} distance to closest found entity is {} and therefore higher than the threshold of {}",
identifier.getValue(),
identifier.getEntityPosition().get(0).pageNumber(),
identifier.getEntityPosition().stream().map(RectangleWithPage::rectangle2D).toList(),
manualEntity.getValue(),
manualEntity.getEntityPosition().get(0).pageNumber(),
manualEntity.getEntityPosition().stream().map(RectangleWithPage::rectangle2D).toList(),
distance,
matchThreshold);
return Optional.empty();
@ -99,7 +104,8 @@ public class EntityFindingUtility {
}
return originalPositions.stream()
.mapToDouble(rectangleWithPage -> calculateMinDistancePerRectangle(entity, rectangleWithPage.pageNumber(), rectangleWithPage.rectangle2D()))
.sum();
.average()
.orElse(Double.MAX_VALUE);
}
@ -145,7 +151,7 @@ public class EntityFindingUtility {
public Map<String, List<TextEntity>> findAllPossibleEntitiesAndGroupByValue(SemanticNode node, List<ManualEntity> manualEntities) {
Set<Integer> pageNumbers = manualEntities.stream().flatMap(entry -> entry.getEntityPosition().stream().map(RectangleWithPage::pageNumber)).collect(Collectors.toSet());
Set<String> entryValues = manualEntities.stream().map(ManualEntity::getValue).map(String::toLowerCase).collect(Collectors.toSet());
Set<String> entryValues = manualEntities.stream().map(ManualEntity::getValue).filter(Objects::nonNull).map(String::toLowerCase).collect(Collectors.toSet());
if (!pageNumbers.stream().allMatch(node::onPage)) {
throw new IllegalArgumentException(format("SemanticNode \"%s\" does not contain these pages %s, it has pages: %s",

View File

@ -29,7 +29,7 @@ import lombok.extern.slf4j.Slf4j;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ManualEntityCreationService {
static double MATCH_THRESHOLD = 5; // Is compared to the sum of distances in pdf coordinates for each corner of the bounding box of the entities
static double MATCH_THRESHOLD = 10; // Is compared to the average sum of distances in pdf coordinates for each corner of the bounding box of the entities
EntityFindingUtility entityFindingUtility;
EntityCreationService entityCreationService;
DictionaryService dictionaryService;

View File

@ -95,7 +95,7 @@ public class RedactionStorageService {
RedactionLog redactionLog = storageService.readJSONObject(TenantContext.getTenantId(),
StorageIdUtils.getStorageId(dossierId, fileId, FileType.REDACTION_LOG),
RedactionLog.class);
redactionLog.setRedactionLogEntry(redactionLog.getRedactionLogEntry().stream().filter(entry -> !(entry.getValue() == null || entry.getValue().isEmpty())).collect(Collectors.toList()));
redactionLog.setRedactionLogEntry(redactionLog.getRedactionLogEntry().stream().filter(entry -> !(entry.getPositions() == null || entry.getPositions().isEmpty())).collect(Collectors.toList()));
return redactionLog;
} catch (StorageObjectDoesNotExist e) {
log.debug("RedactionLog not available.");
@ -110,7 +110,7 @@ public class RedactionStorageService {
try {
EntityLog entityLog = storageService.readJSONObject(TenantContext.getTenantId(), StorageIdUtils.getStorageId(dossierId, fileId, FileType.ENTITY_LOG), EntityLog.class);
entityLog.setEntityLogEntry(entityLog.getEntityLogEntry().stream().filter(entry -> !entry.getValue().isEmpty()).collect(Collectors.toList()));
entityLog.setEntityLogEntry(entityLog.getEntityLogEntry().stream().filter(entry -> !(entry.getPositions() == null || entry.getPositions().isEmpty())).collect(Collectors.toList()));
return entityLog;
} catch (StorageObjectDoesNotExist e) {
log.debug("EntityLog not available.");

View File

@ -1,5 +1,5 @@
server:
port: 8077
port: 8083
persistence-service.url: "http://localhost:8085"
tenant-user-management-service.url: "http://localhost:8091/internal"