From 52866ea8f62df6c96698b1ece02c2ff5c4bbac89 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 17 Jul 2023 18:08:23 +0200 Subject: [PATCH] RED-7156: some files stuck in error state * fixed creation of entities with ManualRedactionEntries --- .../adapter/CustomEntityCreationAdapter.java | 255 ++++++++++-------- .../document/graph/nodes/SemanticNode.java | 11 + .../service/EntityRedactionService.java | 4 +- 3 files changed, 164 insertions(+), 106 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/CustomEntityCreationAdapter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/CustomEntityCreationAdapter.java index bc2c9ba7..ed5a4777 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/CustomEntityCreationAdapter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/CustomEntityCreationAdapter.java @@ -14,16 +14,14 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; - import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualRedactionEntry; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry; -import 
com.iqser.red.service.redaction.v1.server.exception.NotFoundException; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition; @@ -52,69 +50,95 @@ public class CustomEntityCreationAdapter { public Stream toRedactionEntity(RedactionLog redactionLog, SemanticNode node) { - List pageNumbers = redactionLog.getRedactionLogEntry().stream().flatMap(entry -> entry.getPositions().stream().map(Rectangle::getPage)).distinct().toList(); - Set entryValues = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet()); + List entityIdentifiers = redactionLog.getRedactionLogEntry().stream().map(EntityIdentifier::fromRedactionLogEntry).toList(); + return toRedactionEntity(entityIdentifiers, node); + } + + + public Stream createRedactionEntities(Set manualRedactionEntries, SemanticNode node) { + + List entityIdentifiers = manualRedactionEntries.stream().map(EntityIdentifier::fromManualRedactionEntry).toList(); + return toRedactionEntity(entityIdentifiers, node); + } + + + private Stream toRedactionEntity(List entityIdentifiers, SemanticNode node) { + + Set pageNumbers = entityIdentifiers.stream().flatMap(entry -> entry.entityPosition().stream().map(RectangleWithPage::pageNumber)).collect(Collectors.toSet()); + Set entryValues = entityIdentifiers.stream().map(EntityIdentifier::value).map(String::toLowerCase).collect(Collectors.toSet()); Map> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValue(node, pageNumbers, entryValues); assert allValuesFound(tempEntitiesByValue, entryValues); - List entities = redactionLog.getRedactionLogEntry() - .stream() - .map(entry -> 
findClosestEntity(entry, tempEntitiesByValue).map(tempEntity -> createCorrectEntity(entry, node, tempEntity))) + List correctEntities = entityIdentifiers.stream() + .map(entityIdentifier -> findClosestEntity(entityIdentifier, tempEntitiesByValue).map(tempEntity -> createCorrectEntity(entityIdentifier, + node, + tempEntity.getBoundary()))) .filter(Optional::isPresent) .map(Optional::get) .toList(); tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph); - return entities.stream(); + return correctEntities.stream(); } - private Optional findClosestEntity(RedactionLogEntry entry, Map> tempEntitiesByValue) { + /** + * Creates a RedactionEntity with the correct values at the given boundary; the temp entities are removed from the graph by the caller afterwards. + * + * @param entityIdentifier The entity identifier for the RedactionEntity. + * @param node The SemanticNode associated with the RedactionEntity. + * @param closestBoundary The Boundary of the closest matching temp entity, used as the boundary of the created RedactionEntity. + * @return The created correct RedactionEntity. 
+ */ + private RedactionEntity createCorrectEntity(EntityIdentifier entityIdentifier, SemanticNode node, Boundary closestBoundary) { - List possibleEntities = tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT)); + RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestBoundary, entityIdentifier.type(), entityIdentifier.entityType, node); + + if (entityIdentifier.redacted()) { + correctEntity.force(entityIdentifier.ruleIdentifier(), entityIdentifier.reason(), entityIdentifier.legalBasis()); + } else { + correctEntity.skip(entityIdentifier.ruleIdentifier(), entityIdentifier.reason()); + } + correctEntity.setDictionaryEntry(entityIdentifier.isDictionaryEntry()); + correctEntity.setDossierDictionaryEntry(entityIdentifier.isDossierDictionaryEntry()); + return correctEntity; + } + + + private Optional findClosestEntity(EntityIdentifier identifier, Map> entitiesWithSameValue) { + + List possibleEntities = entitiesWithSameValue.get(identifier.value().toLowerCase(Locale.ROOT)); if (possibleEntities == null || possibleEntities.isEmpty()) { - log.warn("Entity could not be created for manual add entry: {}, due to the string not being found.", entry); + log.warn("Entity could not be created with identifier: {}, due to the value {} not being found anywhere.", identifier, identifier.value()); return Optional.empty(); } - return findClosestRedactionEntity(entry.getPositions(), possibleEntities); + Optional optionalClosestEntity = possibleEntities.stream() + .filter(entity -> pagesMatch(entity, identifier.entityPosition())) + .min(Comparator.comparingDouble(entity -> calculateMinDistance(identifier.entityPosition(), entity))); + + if (optionalClosestEntity.isEmpty()) { + log.warn("No Entity with value {} found on page {}", identifier.value(), identifier.entityPosition()); + return optionalClosestEntity; + } + + RedactionEntity closestEntity = optionalClosestEntity.get(); + double distance = calculateMinDistance(identifier.entityPosition(), 
closestEntity); + if (distance > MATCH_THRESHOLD) { + log.warn(format("Distance to closest found entity is %.2f and therefore higher than the threshold of %.2f for \n%s \n%s", + distance, + MATCH_THRESHOLD, + identifier.entityPosition(), + closestEntity.getRedactionPositionsPerPage())); + return Optional.empty(); + } + + return Optional.of(closestEntity); } - public void createRedactionEntities(Set manualRedactionEntries, SemanticNode node) { - - List pageNumbers = manualRedactionEntries.stream() - .flatMap(entry -> entry.getPositions().stream().map(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle::getPage)) - .distinct() - .toList(); - Set entryValues = manualRedactionEntries.stream().map(ManualRedactionEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet()); - - Map> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValue(node, pageNumbers, entryValues); - - manualRedactionEntries.forEach(entry -> { - List possibleEntities = tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT)); - - if (possibleEntities == null || possibleEntities.isEmpty()) { - log.warn("Entity could not be created for manual add entry: {}, due to the string not being found.", entry); - return; - } - - List originalPositions = entry.getPositions().stream().map(CustomEntityCreationAdapter::toRectangle).toList(); - findClosestRedactionEntity(originalPositions, possibleEntities).ifPresent(closestEntity -> createCorrectEntity(entry, node, closestEntity)); - }); - - tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph); - } - - - private static Rectangle toRectangle(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) { - - return new Rectangle(new Point(rectangle.getTopLeftX(), rectangle.getTopLeftY()), rectangle.getWidth(), rectangle.getHeight(), rectangle.getPage()); - } - - - private Map> 
findAllPossibleEntitiesAndGroupByValue(SemanticNode node, List pageNumbers, Set entryValues) { + private Map> findAllPossibleEntitiesAndGroupByValue(SemanticNode node, Set pageNumbers, Set entryValues) { if (!pageNumbers.stream().allMatch(node::isOnPage)) { throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log", @@ -136,69 +160,22 @@ public class CustomEntityCreationAdapter { } - private Optional findClosestRedactionEntity(List originalPositions, List entitiesWithSameValue) { - - RedactionEntity closestEntity = entitiesWithSameValue.stream() - .filter(entity -> pagesMatch(entity, originalPositions)) - .min(Comparator.comparingDouble(entity -> calculateMinDistance(originalPositions, entity))) - .orElseThrow(() -> new NotFoundException(format("No entity with similar position found for %s", originalPositions))); - - double distance = calculateMinDistance(originalPositions, closestEntity); - if (distance > MATCH_THRESHOLD) { - log.warn(format("Distance to closest found entity is %.2f and therefore higher than the threshold of %.2f for \n%s \n%s", - distance, - MATCH_THRESHOLD, - originalPositions, - closestEntity.getRedactionPositionsPerPage())); - return Optional.empty(); - } - - return Optional.of(closestEntity); - } - - - private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) { - - RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestEntity.getBoundary(), - redactionLogEntry.getType(), - redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY, - node); - String ruleIdentifier = redactionLogEntry.getType() + "." 
+ redactionLogEntry.getMatchedRule() + ".0"; - if (redactionLogEntry.isRedacted()) { - correctEntity.apply(ruleIdentifier, redactionLogEntry.getReason(), redactionLogEntry.getLegalBasis()); - } else { - correctEntity.skip(ruleIdentifier, redactionLogEntry.getReason()); - } - correctEntity.setDictionaryEntry(redactionLogEntry.isDictionaryEntry()); - correctEntity.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry()); - return correctEntity; - } - - - private RedactionEntity createCorrectEntity(ManualRedactionEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) { - - RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestEntity.getBoundary(), redactionLogEntry.getType(), EntityType.ENTITY, node); - - correctEntity.force("MAN.0.0", redactionLogEntry.getReason(), redactionLogEntry.getLegalBasis()); - - return correctEntity; - } - - - private static boolean pagesMatch(RedactionEntity entity, List originalPositions) { + private static boolean pagesMatch(RedactionEntity entity, List originalPositions) { Set entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet()); - Set redactionLogEntryPageNumbers = originalPositions.stream().map(Rectangle::getPage).collect(Collectors.toSet()); - return entityPageNumbers.equals(redactionLogEntryPageNumbers); + Set originalPageNumbers = originalPositions.stream().map(RectangleWithPage::pageNumber).collect(Collectors.toSet()); + return entityPageNumbers.containsAll(originalPageNumbers); } - private double calculateMinDistance(List originalPositions, RedactionEntity entity) { + private double calculateMinDistance(List originalPositions, RedactionEntity entity) { if (originalPositions.size() != countRectangles(entity)) { return Double.MAX_VALUE; } - return originalPositions.stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, 
redactionLogEntryRectangle)).sum(); + return originalPositions.stream() + .mapToDouble(rectangleWithPage -> calculateMinDistancePerRectangle(entity, rectangleWithPage.pageNumber(), rectangleWithPage.rectangle2D())) + .sum(); } @@ -208,14 +185,14 @@ public class CustomEntityCreationAdapter { } - private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle originalRectangle) { + private double calculateMinDistancePerRectangle(RedactionEntity entity, int pageNumber, Rectangle2D originalRectangle) { return entity.getRedactionPositionsPerPage() .stream() - .filter(redactionPosition -> redactionPosition.getPage().getNumber() == originalRectangle.getPage()) + .filter(redactionPosition -> redactionPosition.getPage().getNumber() == pageNumber) .map(RedactionPosition::getRectanglePerLine) .flatMap(Collection::stream) - .mapToDouble(rectangle -> calculateDistance(rectangle, toRectangle2D(originalRectangle))) + .mapToDouble(rectangle -> calculateDistance(rectangle, originalRectangle)) .min() .orElse(Double.MAX_VALUE); } @@ -230,9 +207,77 @@ public class CustomEntityCreationAdapter { } - private Rectangle2D toRectangle2D(Rectangle rectangle) { + private record EntityIdentifier( + String value, + List entityPosition, + String ruleIdentifier, + String reason, + String legalBasis, + String type, + EntityType entityType, + boolean redacted, + boolean isDictionaryEntry, + boolean isDossierDictionaryEntry) { + + public static EntityIdentifier fromRedactionLogEntry(RedactionLogEntry redactionLogEntry) { + + String ruleIdentifier = redactionLogEntry.getType() + "." 
+ redactionLogEntry.getMatchedRule() + ".0"; + List rectangleWithPages = redactionLogEntry.getPositions().stream().map(RectangleWithPage::fromRedactionLogRectangle).toList(); + return new EntityIdentifier(redactionLogEntry.getValue(), + rectangleWithPages, + ruleIdentifier, + redactionLogEntry.getReason(), + redactionLogEntry.getLegalBasis(), + redactionLogEntry.getType(), + redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY, + redactionLogEntry.isRedacted(), + redactionLogEntry.isDictionaryEntry(), + redactionLogEntry.isDossierDictionaryEntry()); + } + + + public static EntityIdentifier fromManualRedactionEntry(ManualRedactionEntry manualRedactionEntry) { + + List rectangleWithPages = manualRedactionEntry.getPositions().stream().map(RectangleWithPage::fromAnnotationRectangle).toList(); + return new EntityIdentifier(manualRedactionEntry.getValue(), + rectangleWithPages, + "MAN.0.0", + manualRedactionEntry.getReason(), + manualRedactionEntry.getLegalBasis(), + manualRedactionEntry.getType(), + EntityType.ENTITY, + true, + false, + false); + } + + } + + private record RectangleWithPage(int pageNumber, Rectangle2D rectangle2D) { + + public static RectangleWithPage fromRedactionLogRectangle(Rectangle rectangle) { + + return new RectangleWithPage(rectangle.getPage(), toRectangle2D(rectangle)); + } + + + public static RectangleWithPage fromAnnotationRectangle(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) { + + return new RectangleWithPage(rectangle.getPage(), toRectangle2D(rectangle)); + } + + + private static Rectangle2D toRectangle2D(Rectangle rectangle) { + + return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight()); + } + + + private static Rectangle2D toRectangle2D(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) { + + return new 
Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight()); + } - return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight()); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/SemanticNode.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/SemanticNode.java index fe9d14a1..80f3f047 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/SemanticNode.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/SemanticNode.java @@ -4,6 +4,7 @@ import static java.lang.String.format; import java.awt.geom.Rectangle2D; import java.util.Arrays; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -58,6 +59,16 @@ public interface SemanticNode { return getTextBlock().getPages(); } + /** + * Finds the first page (the one with the lowest page number) associated with this Node. + * + * @return The Page with the lowest page number this node appears on. + */ + default Page getFirstPage() { + + return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(); + } + /** * Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. 
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index db34af9c..8ecb6124 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -13,6 +13,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.CustomEntityCreationAdapter; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService; @@ -75,7 +76,8 @@ public class EntityRedactionService { public void addManualAddRedactionEntities(Set manualRedactionEntries, Document document) { - customEntityCreationAdapter.createRedactionEntities(manualRedactionEntries, document); + // Entities are automatically added to the DocumentGraph and don't need to be inserted again. + List entities = customEntityCreationAdapter.createRedactionEntities(manualRedactionEntries, document).toList(); }