RED-7156: some files stuck in error state

* fixed creation of entities with ManualRedactionEntries
This commit is contained in:
Kilian Schuettler 2023-07-17 18:08:23 +02:00
parent 091895044a
commit 52866ea8f6
3 changed files with 164 additions and 106 deletions

View File

@ -14,16 +14,14 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualRedactionEntry;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition;
@ -52,69 +50,95 @@ public class CustomEntityCreationAdapter {
/**
 * Recreates the entities described by a RedactionLog inside the given node.
 * Every RedactionLogEntry is converted into an EntityIdentifier and the resulting list is handed to the identifier-based overload.
 *
 * @param redactionLog the redaction log whose entries are converted
 * @param node the SemanticNode passed through to the identifier-based overload
 * @return the stream produced by the identifier-based overload
 */
public Stream<RedactionEntity> toRedactionEntity(RedactionLog redactionLog, SemanticNode node) {
    // Page numbers and lower-cased values are derived from the identifiers by the overload, so they are not computed here.
    List<EntityIdentifier> entityIdentifiers = redactionLog.getRedactionLogEntry().stream().map(EntityIdentifier::fromRedactionLogEntry).toList();
    return toRedactionEntity(entityIdentifiers, node);
}
/**
 * Creates entities for the given manual redaction entries inside the given node.
 * Each entry is converted with EntityIdentifier.fromManualRedactionEntry and the list is handed to the identifier-based toRedactionEntity overload.
 *
 * @param manualRedactionEntries the manual entries to convert
 * @param node the SemanticNode passed through to the overload
 * @return the stream produced by the identifier-based overload
 */
public Stream<RedactionEntity> createRedactionEntities(Set<ManualRedactionEntry> manualRedactionEntries, SemanticNode node) {
    Stream<EntityIdentifier> identifierStream = manualRedactionEntries.stream().map(EntityIdentifier::fromManualRedactionEntry);
    return toRedactionEntity(identifierStream.toList(), node);
}
private Stream<RedactionEntity> toRedactionEntity(List<EntityIdentifier> entityIdentifiers, SemanticNode node) {
Set<Integer> pageNumbers = entityIdentifiers.stream().flatMap(entry -> entry.entityPosition().stream().map(RectangleWithPage::pageNumber)).collect(Collectors.toSet());
Set<String> entryValues = entityIdentifiers.stream().map(EntityIdentifier::value).map(String::toLowerCase).collect(Collectors.toSet());
Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValue(node, pageNumbers, entryValues);
assert allValuesFound(tempEntitiesByValue, entryValues);
List<RedactionEntity> entities = redactionLog.getRedactionLogEntry()
.stream()
.map(entry -> findClosestEntity(entry, tempEntitiesByValue).map(tempEntity -> createCorrectEntity(entry, node, tempEntity)))
List<RedactionEntity> correctEntities = entityIdentifiers.stream()
.map(entityIdentifier -> findClosestEntity(entityIdentifier, tempEntitiesByValue).map(tempEntity -> createCorrectEntity(entityIdentifier,
node,
tempEntity.getBoundary())))
.filter(Optional::isPresent)
.map(Optional::get)
.toList();
tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
return entities.stream();
return correctEntities.stream();
}
private Optional<RedactionEntity> findClosestEntity(RedactionLogEntry entry, Map<String, List<RedactionEntity>> tempEntitiesByValue) {
/**
 * Creates a RedactionEntity with the correct values at the given boundary.
 * <p>
 * The entity is forced when the identifier is marked as redacted and skipped otherwise; the dictionary flags are copied from the identifier.
 * This method does not remove any temp entity itself.
 *
 * @param entityIdentifier The entity identifier supplying type, rule, reason, legal basis and flags.
 * @param node The SemanticNode passed to entityCreationService.forceByBoundary.
 * @param closestBoundary The Boundary at which the entity is created.
 * @return The created correct RedactionEntity.
 */
private RedactionEntity createCorrectEntity(EntityIdentifier entityIdentifier, SemanticNode node, Boundary closestBoundary) {
    RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestBoundary, entityIdentifier.type(), entityIdentifier.entityType(), node);
    if (entityIdentifier.redacted()) {
        // NOTE(review): the RedactionLogEntry-specific variant in this file calls apply(...) instead of force(...) for redacted entries — confirm force is intended for both sources.
        correctEntity.force(entityIdentifier.ruleIdentifier(), entityIdentifier.reason(), entityIdentifier.legalBasis());
    } else {
        correctEntity.skip(entityIdentifier.ruleIdentifier(), entityIdentifier.reason());
    }
    correctEntity.setDictionaryEntry(entityIdentifier.isDictionaryEntry());
    correctEntity.setDossierDictionaryEntry(entityIdentifier.isDossierDictionaryEntry());
    return correctEntity;
}
/**
 * Picks, among the candidate entities registered for the identifier's value, the one positioned closest to the identifier.
 * <p>
 * Candidates are looked up by the lower-cased value, restricted to those for which pagesMatch holds and ranked by calculateMinDistance.
 * An empty Optional is returned, after logging a warning, when no candidate is registered for the value, when no candidate passes the page check,
 * or when the smallest distance exceeds MATCH_THRESHOLD.
 *
 * @param identifier the identifier whose value and positions are matched
 * @param entitiesWithSameValue candidate entities, looked up by lower-cased value
 * @return the closest candidate, or an empty Optional if none qualifies
 */
private Optional<RedactionEntity> findClosestEntity(EntityIdentifier identifier, Map<String, List<RedactionEntity>> entitiesWithSameValue) {
    List<RedactionEntity> possibleEntities = entitiesWithSameValue.get(identifier.value().toLowerCase(Locale.ROOT));
    if (possibleEntities == null || possibleEntities.isEmpty()) {
        log.warn("Entity could not be created with identifier: {}, due to the value {} not being found anywhere.", identifier, identifier.value());
        return Optional.empty();
    }
    Optional<RedactionEntity> optionalClosestEntity = possibleEntities.stream()
            .filter(entity -> pagesMatch(entity, identifier.entityPosition()))
            .min(Comparator.comparingDouble(entity -> calculateMinDistance(identifier.entityPosition(), entity)));
    if (optionalClosestEntity.isEmpty()) {
        log.warn("No Entity with value {} found on page {}", identifier.value(), identifier.entityPosition());
        return optionalClosestEntity;
    }
    RedactionEntity closestEntity = optionalClosestEntity.get();
    double distance = calculateMinDistance(identifier.entityPosition(), closestEntity);
    if (distance > MATCH_THRESHOLD) {
        log.warn(format("Distance to closest found entity is %.2f and therefore higher than the threshold of %.2f for \n%s \n%s",
                distance,
                MATCH_THRESHOLD,
                identifier.entityPosition(),
                closestEntity.getRedactionPositionsPerPage()));
        return Optional.empty();
    }
    return Optional.of(closestEntity);
}
/**
 * Creates an entity in the given node for every manual redaction entry whose value and position can be matched.
 * <p>
 * Candidates are obtained from findAllPossibleEntitiesAndGroupByValue for the entries' pages and lower-cased values.
 * Entries without any candidate are logged and ignored. All candidates are removed from the graph at the end.
 *
 * @param manualRedactionEntries the manual entries to create entities for
 * @param node the SemanticNode in which candidates are searched and entities are created
 */
public void createRedactionEntities(Set<ManualRedactionEntry> manualRedactionEntries, SemanticNode node) {
    List<Integer> pageNumbers = manualRedactionEntries.stream()
            .flatMap(entry -> entry.getPositions().stream())
            .map(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle::getPage)
            .distinct()
            .toList();
    Set<String> entryValues = manualRedactionEntries.stream()
            .map(ManualRedactionEntry::getValue)
            .map(String::toLowerCase)
            .collect(Collectors.toSet());
    Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValue(node, pageNumbers, entryValues);

    for (ManualRedactionEntry entry : manualRedactionEntries) {
        List<RedactionEntity> candidates = tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT));
        boolean hasCandidates = candidates != null && !candidates.isEmpty();
        if (!hasCandidates) {
            log.warn("Entity could not be created for manual add entry: {}, due to the string not being found.", entry);
            continue;
        }
        List<Rectangle> originalPositions = entry.getPositions().stream().map(CustomEntityCreationAdapter::toRectangle).toList();
        findClosestRedactionEntity(originalPositions, candidates).ifPresent(closestEntity -> createCorrectEntity(entry, node, closestEntity));
    }

    // Clean up every candidate, whether or not it was chosen as a match.
    tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
}
/**
 * Converts an annotation Rectangle into a redaction-log Rectangle with the same top-left point, width, height and page.
 *
 * @param rectangle the annotation rectangle to convert
 * @return the equivalent redaction-log Rectangle
 */
private static Rectangle toRectangle(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) {
    Point topLeft = new Point(rectangle.getTopLeftX(), rectangle.getTopLeftY());
    return new Rectangle(topLeft, rectangle.getWidth(), rectangle.getHeight(), rectangle.getPage());
}
private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValue(SemanticNode node, List<Integer> pageNumbers, Set<String> entryValues) {
private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValue(SemanticNode node, Set<Integer> pageNumbers, Set<String> entryValues) {
if (!pageNumbers.stream().allMatch(node::isOnPage)) {
throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log",
@ -136,69 +160,22 @@ public class CustomEntityCreationAdapter {
}
/**
 * Picks, among entities sharing the searched value, the one positioned closest to the original positions.
 * <p>
 * Only entities for which pagesMatch holds are considered; they are ranked by calculateMinDistance.
 * An empty Optional is returned, after logging a warning, when no entity passes the page check or when the smallest distance exceeds MATCH_THRESHOLD.
 * The "no candidate" case is reported as an empty result rather than an exception, matching the threshold branch and findClosestEntity.
 *
 * @param originalPositions the positions the entity is expected at
 * @param entitiesWithSameValue the candidate entities
 * @return the closest candidate, or an empty Optional if none qualifies
 */
private Optional<RedactionEntity> findClosestRedactionEntity(List<Rectangle> originalPositions, List<RedactionEntity> entitiesWithSameValue) {
    Optional<RedactionEntity> optionalClosestEntity = entitiesWithSameValue.stream()
            .filter(entity -> pagesMatch(entity, originalPositions))
            .min(Comparator.comparingDouble(entity -> calculateMinDistance(originalPositions, entity)));
    if (optionalClosestEntity.isEmpty()) {
        log.warn("No entity with similar position found for {}", originalPositions);
        return optionalClosestEntity;
    }
    RedactionEntity closestEntity = optionalClosestEntity.get();
    double distance = calculateMinDistance(originalPositions, closestEntity);
    if (distance > MATCH_THRESHOLD) {
        log.warn(format("Distance to closest found entity is %.2f and therefore higher than the threshold of %.2f for \n%s \n%s",
                distance,
                MATCH_THRESHOLD,
                originalPositions,
                closestEntity.getRedactionPositionsPerPage()));
        return Optional.empty();
    }
    return Optional.of(closestEntity);
}
/**
 * Creates an entity at the boundary of the closest entity, configured from a RedactionLogEntry.
 * <p>
 * The entity type is RECOMMENDATION for recommendation entries and ENTITY otherwise. The rule identifier is built as
 * {@code <type>.<matchedRule>.0}. Redacted entries are applied, all others are skipped; the dictionary flags are copied from the entry.
 *
 * @param redactionLogEntry the entry supplying type, rule, reason, legal basis and flags
 * @param node the SemanticNode passed to entityCreationService.forceByBoundary
 * @param closestEntity the entity whose boundary is reused
 * @return the created entity
 */
private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {
    RedactionEntity created = entityCreationService.forceByBoundary(
            closestEntity.getBoundary(),
            redactionLogEntry.getType(),
            redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY,
            node);

    String rule = redactionLogEntry.getType() + "." + redactionLogEntry.getMatchedRule() + ".0";
    if (!redactionLogEntry.isRedacted()) {
        created.skip(rule, redactionLogEntry.getReason());
    } else {
        created.apply(rule, redactionLogEntry.getReason(), redactionLogEntry.getLegalBasis());
    }

    created.setDictionaryEntry(redactionLogEntry.isDictionaryEntry());
    created.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry());
    return created;
}
/**
 * Creates an ENTITY-typed entity at the boundary of the closest entity and forces it with the fixed rule identifier "MAN.0.0".
 *
 * @param redactionLogEntry the manual entry supplying type, reason and legal basis
 * @param node the SemanticNode passed to entityCreationService.forceByBoundary
 * @param closestEntity the entity whose boundary is reused
 * @return the created, forced entity
 */
private RedactionEntity createCorrectEntity(ManualRedactionEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {
    RedactionEntity manualEntity = entityCreationService.forceByBoundary(
            closestEntity.getBoundary(),
            redactionLogEntry.getType(),
            EntityType.ENTITY,
            node);
    manualEntity.force(
            "MAN.0.0",
            redactionLogEntry.getReason(),
            redactionLogEntry.getLegalBasis());
    return manualEntity;
}
private static boolean pagesMatch(RedactionEntity entity, List<Rectangle> originalPositions) {
private static boolean pagesMatch(RedactionEntity entity, List<RectangleWithPage> originalPositions) {
Set<Integer> entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet());
Set<Integer> redactionLogEntryPageNumbers = originalPositions.stream().map(Rectangle::getPage).collect(Collectors.toSet());
return entityPageNumbers.equals(redactionLogEntryPageNumbers);
Set<Integer> originalPageNumbers = originalPositions.stream().map(RectangleWithPage::pageNumber).collect(Collectors.toSet());
return entityPageNumbers.containsAll(originalPageNumbers);
}
private double calculateMinDistance(List<Rectangle> originalPositions, RedactionEntity entity) {
private double calculateMinDistance(List<RectangleWithPage> originalPositions, RedactionEntity entity) {
if (originalPositions.size() != countRectangles(entity)) {
return Double.MAX_VALUE;
}
return originalPositions.stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, redactionLogEntryRectangle)).sum();
return originalPositions.stream()
.mapToDouble(rectangleWithPage -> calculateMinDistancePerRectangle(entity, rectangleWithPage.pageNumber(), rectangleWithPage.rectangle2D()))
.sum();
}
@ -208,14 +185,14 @@ public class CustomEntityCreationAdapter {
}
private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle originalRectangle) {
private double calculateMinDistancePerRectangle(RedactionEntity entity, int pageNumber, Rectangle2D originalRectangle) {
return entity.getRedactionPositionsPerPage()
.stream()
.filter(redactionPosition -> redactionPosition.getPage().getNumber() == originalRectangle.getPage())
.filter(redactionPosition -> redactionPosition.getPage().getNumber() == pageNumber)
.map(RedactionPosition::getRectanglePerLine)
.flatMap(Collection::stream)
.mapToDouble(rectangle -> calculateDistance(rectangle, toRectangle2D(originalRectangle)))
.mapToDouble(rectangle -> calculateDistance(rectangle, originalRectangle))
.min()
.orElse(Double.MAX_VALUE);
}
@ -230,9 +207,77 @@ public class CustomEntityCreationAdapter {
}
private Rectangle2D toRectangle2D(Rectangle rectangle) {
/**
 * Source-independent description of an entity to create: its value, positions, rule identifier, reason, legal basis, type and flags.
 * Instances are built from either a RedactionLogEntry or a ManualRedactionEntry via the static factories.
 */
private record EntityIdentifier(
        String value,
        List<RectangleWithPage> entityPosition,
        String ruleIdentifier,
        String reason,
        String legalBasis,
        String type,
        EntityType entityType,
        boolean redacted,
        boolean isDictionaryEntry,
        boolean isDossierDictionaryEntry) {

    /**
     * Builds an identifier from a redaction log entry.
     * The rule identifier is {@code <type>.<matchedRule>.0}; the entity type is RECOMMENDATION for recommendations and ENTITY otherwise.
     *
     * @param redactionLogEntry the entry to convert
     * @return the identifier carrying the entry's values and flags
     */
    public static EntityIdentifier fromRedactionLogEntry(RedactionLogEntry redactionLogEntry) {
        String rule = redactionLogEntry.getType() + "." + redactionLogEntry.getMatchedRule() + ".0";
        List<RectangleWithPage> positions = redactionLogEntry.getPositions()
                .stream()
                .map(RectangleWithPage::fromRedactionLogRectangle)
                .toList();
        EntityType kind;
        if (redactionLogEntry.isRecommendation()) {
            kind = EntityType.RECOMMENDATION;
        } else {
            kind = EntityType.ENTITY;
        }
        return new EntityIdentifier(redactionLogEntry.getValue(),
                positions,
                rule,
                redactionLogEntry.getReason(),
                redactionLogEntry.getLegalBasis(),
                redactionLogEntry.getType(),
                kind,
                redactionLogEntry.isRedacted(),
                redactionLogEntry.isDictionaryEntry(),
                redactionLogEntry.isDossierDictionaryEntry());
    }

    /**
     * Builds an identifier from a manual redaction entry.
     * Manual entries always use the rule identifier "MAN.0.0", the entity type ENTITY, are marked as redacted and carry no dictionary flags.
     *
     * @param manualRedactionEntry the entry to convert
     * @return the identifier carrying the entry's value, positions, reason, legal basis and type
     */
    public static EntityIdentifier fromManualRedactionEntry(ManualRedactionEntry manualRedactionEntry) {
        List<RectangleWithPage> positions = manualRedactionEntry.getPositions()
                .stream()
                .map(RectangleWithPage::fromAnnotationRectangle)
                .toList();
        return new EntityIdentifier(manualRedactionEntry.getValue(),
                positions,
                "MAN.0.0",
                manualRedactionEntry.getReason(),
                manualRedactionEntry.getLegalBasis(),
                manualRedactionEntry.getType(),
                EntityType.ENTITY,
                true,
                false,
                false);
    }
}
private record RectangleWithPage(int pageNumber, Rectangle2D rectangle2D) {
public static RectangleWithPage fromRedactionLogRectangle(Rectangle rectangle) {
return new RectangleWithPage(rectangle.getPage(), toRectangle2D(rectangle));
}
public static RectangleWithPage fromAnnotationRectangle(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) {
return new RectangleWithPage(rectangle.getPage(), toRectangle2D(rectangle));
}
private static Rectangle2D toRectangle2D(Rectangle rectangle) {
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
}
private static Rectangle2D toRectangle2D(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) {
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
}
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
}
}

View File

@ -4,6 +4,7 @@ import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -58,6 +59,16 @@ public interface SemanticNode {
return getTextBlock().getPages();
}
/**
 * Finds the first page associated with this Node, i.e. the page with the lowest page number among the pages of this node's TextBlock.
 *
 * @return the Page with the smallest page number.
 * @throws java.util.NoSuchElementException if the TextBlock reports no pages.
 */
default Page getFirstPage() {
    return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow();
}
/**
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.

View File

@ -13,6 +13,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.CustomEntityCreationAdapter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService;
@ -75,7 +76,8 @@ public class EntityRedactionService {
/**
 * Creates the entities for the given manual redaction entries in the document.
 * The adapter is invoked exactly once; its result is not stored.
 *
 * @param manualRedactionEntries the manual entries to create entities for
 * @param document the document handed to the adapter as the node to create the entities in
 */
public void addManualAddRedactionEntities(Set<ManualRedactionEntry> manualRedactionEntries, Document document) {
    // Entities are automatically added to the DocumentGraph and don't need to be inserted again.
    // The returned stream is still consumed so that entity creation is not skipped should the adapter ever build it lazily.
    customEntityCreationAdapter.createRedactionEntities(manualRedactionEntries, document).toList();
}