RED-7156: some files stuck in error state
* fixed creation of entities with ManualRedactionEntries
This commit is contained in:
parent
091895044a
commit
52866ea8f6
@ -14,16 +14,14 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.ManualRedactionEntry;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.NotFoundException;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition;
|
||||
@ -52,69 +50,95 @@ public class CustomEntityCreationAdapter {
|
||||
|
||||
public Stream<RedactionEntity> toRedactionEntity(RedactionLog redactionLog, SemanticNode node) {
|
||||
|
||||
List<Integer> pageNumbers = redactionLog.getRedactionLogEntry().stream().flatMap(entry -> entry.getPositions().stream().map(Rectangle::getPage)).distinct().toList();
|
||||
Set<String> entryValues = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet());
|
||||
List<EntityIdentifier> entityIdentifiers = redactionLog.getRedactionLogEntry().stream().map(EntityIdentifier::fromRedactionLogEntry).toList();
|
||||
return toRedactionEntity(entityIdentifiers, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> createRedactionEntities(Set<ManualRedactionEntry> manualRedactionEntries, SemanticNode node) {
|
||||
|
||||
List<EntityIdentifier> entityIdentifiers = manualRedactionEntries.stream().map(EntityIdentifier::fromManualRedactionEntry).toList();
|
||||
return toRedactionEntity(entityIdentifiers, node);
|
||||
}
|
||||
|
||||
|
||||
private Stream<RedactionEntity> toRedactionEntity(List<EntityIdentifier> entityIdentifiers, SemanticNode node) {
|
||||
|
||||
Set<Integer> pageNumbers = entityIdentifiers.stream().flatMap(entry -> entry.entityPosition().stream().map(RectangleWithPage::pageNumber)).collect(Collectors.toSet());
|
||||
Set<String> entryValues = entityIdentifiers.stream().map(EntityIdentifier::value).map(String::toLowerCase).collect(Collectors.toSet());
|
||||
|
||||
Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValue(node, pageNumbers, entryValues);
|
||||
assert allValuesFound(tempEntitiesByValue, entryValues);
|
||||
|
||||
List<RedactionEntity> entities = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.map(entry -> findClosestEntity(entry, tempEntitiesByValue).map(tempEntity -> createCorrectEntity(entry, node, tempEntity)))
|
||||
List<RedactionEntity> correctEntities = entityIdentifiers.stream()
|
||||
.map(entityIdentifier -> findClosestEntity(entityIdentifier, tempEntitiesByValue).map(tempEntity -> createCorrectEntity(entityIdentifier,
|
||||
node,
|
||||
tempEntity.getBoundary())))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.toList();
|
||||
tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
|
||||
return entities.stream();
|
||||
return correctEntities.stream();
|
||||
}
|
||||
|
||||
|
||||
private Optional<RedactionEntity> findClosestEntity(RedactionLogEntry entry, Map<String, List<RedactionEntity>> tempEntitiesByValue) {
|
||||
/**
|
||||
* Deletes the temp Entity and creates a RedactionEntity with correct values, based on the given parameters.
|
||||
*
|
||||
* @param entityIdentifier The entity identifier for the RedactionEntity.
|
||||
* @param node The SemanticNode associated with the RedactionEntity.
|
||||
* @param closestBoundary The closest Boundary to the RedactionEntity.
|
||||
* @return The created correct RedactionEntity.
|
||||
*/
|
||||
private RedactionEntity createCorrectEntity(EntityIdentifier entityIdentifier, SemanticNode node, Boundary closestBoundary) {
|
||||
|
||||
List<RedactionEntity> possibleEntities = tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT));
|
||||
RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestBoundary, entityIdentifier.type(), entityIdentifier.entityType, node);
|
||||
|
||||
if (entityIdentifier.redacted()) {
|
||||
correctEntity.force(entityIdentifier.ruleIdentifier(), entityIdentifier.reason(), entityIdentifier.legalBasis());
|
||||
} else {
|
||||
correctEntity.skip(entityIdentifier.ruleIdentifier(), entityIdentifier.reason());
|
||||
}
|
||||
correctEntity.setDictionaryEntry(entityIdentifier.isDictionaryEntry());
|
||||
correctEntity.setDossierDictionaryEntry(entityIdentifier.isDossierDictionaryEntry());
|
||||
return correctEntity;
|
||||
}
|
||||
|
||||
|
||||
private Optional<RedactionEntity> findClosestEntity(EntityIdentifier identifier, Map<String, List<RedactionEntity>> entitiesWithSameValue) {
|
||||
|
||||
List<RedactionEntity> possibleEntities = entitiesWithSameValue.get(identifier.value().toLowerCase(Locale.ROOT));
|
||||
|
||||
if (possibleEntities == null || possibleEntities.isEmpty()) {
|
||||
log.warn("Entity could not be created for manual add entry: {}, due to the string not being found.", entry);
|
||||
log.warn("Entity could not be created with identifier: {}, due to the value {} not being found anywhere.", identifier, identifier.value());
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return findClosestRedactionEntity(entry.getPositions(), possibleEntities);
|
||||
Optional<RedactionEntity> optionalClosestEntity = possibleEntities.stream()
|
||||
.filter(entity -> pagesMatch(entity, identifier.entityPosition()))
|
||||
.min(Comparator.comparingDouble(entity -> calculateMinDistance(identifier.entityPosition(), entity)));
|
||||
|
||||
if (optionalClosestEntity.isEmpty()) {
|
||||
log.warn("No Entity with value {} found on page {}", identifier.value(), identifier.entityPosition());
|
||||
return optionalClosestEntity;
|
||||
}
|
||||
|
||||
RedactionEntity closestEntity = optionalClosestEntity.get();
|
||||
double distance = calculateMinDistance(identifier.entityPosition(), closestEntity);
|
||||
if (distance > MATCH_THRESHOLD) {
|
||||
log.warn(format("Distance to closest found entity is %.2f and therefore higher than the threshold of %.2f for \n%s \n%s",
|
||||
distance,
|
||||
MATCH_THRESHOLD,
|
||||
identifier.entityPosition(),
|
||||
closestEntity.getRedactionPositionsPerPage()));
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(closestEntity);
|
||||
}
|
||||
|
||||
|
||||
public void createRedactionEntities(Set<ManualRedactionEntry> manualRedactionEntries, SemanticNode node) {
|
||||
|
||||
List<Integer> pageNumbers = manualRedactionEntries.stream()
|
||||
.flatMap(entry -> entry.getPositions().stream().map(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle::getPage))
|
||||
.distinct()
|
||||
.toList();
|
||||
Set<String> entryValues = manualRedactionEntries.stream().map(ManualRedactionEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet());
|
||||
|
||||
Map<String, List<RedactionEntity>> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValue(node, pageNumbers, entryValues);
|
||||
|
||||
manualRedactionEntries.forEach(entry -> {
|
||||
List<RedactionEntity> possibleEntities = tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT));
|
||||
|
||||
if (possibleEntities == null || possibleEntities.isEmpty()) {
|
||||
log.warn("Entity could not be created for manual add entry: {}, due to the string not being found.", entry);
|
||||
return;
|
||||
}
|
||||
|
||||
List<Rectangle> originalPositions = entry.getPositions().stream().map(CustomEntityCreationAdapter::toRectangle).toList();
|
||||
findClosestRedactionEntity(originalPositions, possibleEntities).ifPresent(closestEntity -> createCorrectEntity(entry, node, closestEntity));
|
||||
});
|
||||
|
||||
tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph);
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle toRectangle(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) {
|
||||
|
||||
return new Rectangle(new Point(rectangle.getTopLeftX(), rectangle.getTopLeftY()), rectangle.getWidth(), rectangle.getHeight(), rectangle.getPage());
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValue(SemanticNode node, List<Integer> pageNumbers, Set<String> entryValues) {
|
||||
private Map<String, List<RedactionEntity>> findAllPossibleEntitiesAndGroupByValue(SemanticNode node, Set<Integer> pageNumbers, Set<String> entryValues) {
|
||||
|
||||
if (!pageNumbers.stream().allMatch(node::isOnPage)) {
|
||||
throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log",
|
||||
@ -136,69 +160,22 @@ public class CustomEntityCreationAdapter {
|
||||
}
|
||||
|
||||
|
||||
private Optional<RedactionEntity> findClosestRedactionEntity(List<Rectangle> originalPositions, List<RedactionEntity> entitiesWithSameValue) {
|
||||
|
||||
RedactionEntity closestEntity = entitiesWithSameValue.stream()
|
||||
.filter(entity -> pagesMatch(entity, originalPositions))
|
||||
.min(Comparator.comparingDouble(entity -> calculateMinDistance(originalPositions, entity)))
|
||||
.orElseThrow(() -> new NotFoundException(format("No entity with similar position found for %s", originalPositions)));
|
||||
|
||||
double distance = calculateMinDistance(originalPositions, closestEntity);
|
||||
if (distance > MATCH_THRESHOLD) {
|
||||
log.warn(format("Distance to closest found entity is %.2f and therefore higher than the threshold of %.2f for \n%s \n%s",
|
||||
distance,
|
||||
MATCH_THRESHOLD,
|
||||
originalPositions,
|
||||
closestEntity.getRedactionPositionsPerPage()));
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(closestEntity);
|
||||
}
|
||||
|
||||
|
||||
private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {
|
||||
|
||||
RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestEntity.getBoundary(),
|
||||
redactionLogEntry.getType(),
|
||||
redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY,
|
||||
node);
|
||||
String ruleIdentifier = redactionLogEntry.getType() + "." + redactionLogEntry.getMatchedRule() + ".0";
|
||||
if (redactionLogEntry.isRedacted()) {
|
||||
correctEntity.apply(ruleIdentifier, redactionLogEntry.getReason(), redactionLogEntry.getLegalBasis());
|
||||
} else {
|
||||
correctEntity.skip(ruleIdentifier, redactionLogEntry.getReason());
|
||||
}
|
||||
correctEntity.setDictionaryEntry(redactionLogEntry.isDictionaryEntry());
|
||||
correctEntity.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry());
|
||||
return correctEntity;
|
||||
}
|
||||
|
||||
|
||||
private RedactionEntity createCorrectEntity(ManualRedactionEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) {
|
||||
|
||||
RedactionEntity correctEntity = entityCreationService.forceByBoundary(closestEntity.getBoundary(), redactionLogEntry.getType(), EntityType.ENTITY, node);
|
||||
|
||||
correctEntity.force("MAN.0.0", redactionLogEntry.getReason(), redactionLogEntry.getLegalBasis());
|
||||
|
||||
return correctEntity;
|
||||
}
|
||||
|
||||
|
||||
private static boolean pagesMatch(RedactionEntity entity, List<Rectangle> originalPositions) {
|
||||
private static boolean pagesMatch(RedactionEntity entity, List<RectangleWithPage> originalPositions) {
|
||||
|
||||
Set<Integer> entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet());
|
||||
Set<Integer> redactionLogEntryPageNumbers = originalPositions.stream().map(Rectangle::getPage).collect(Collectors.toSet());
|
||||
return entityPageNumbers.equals(redactionLogEntryPageNumbers);
|
||||
Set<Integer> originalPageNumbers = originalPositions.stream().map(RectangleWithPage::pageNumber).collect(Collectors.toSet());
|
||||
return entityPageNumbers.containsAll(originalPageNumbers);
|
||||
}
|
||||
|
||||
|
||||
private double calculateMinDistance(List<Rectangle> originalPositions, RedactionEntity entity) {
|
||||
private double calculateMinDistance(List<RectangleWithPage> originalPositions, RedactionEntity entity) {
|
||||
|
||||
if (originalPositions.size() != countRectangles(entity)) {
|
||||
return Double.MAX_VALUE;
|
||||
}
|
||||
return originalPositions.stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, redactionLogEntryRectangle)).sum();
|
||||
return originalPositions.stream()
|
||||
.mapToDouble(rectangleWithPage -> calculateMinDistancePerRectangle(entity, rectangleWithPage.pageNumber(), rectangleWithPage.rectangle2D()))
|
||||
.sum();
|
||||
}
|
||||
|
||||
|
||||
@ -208,14 +185,14 @@ public class CustomEntityCreationAdapter {
|
||||
}
|
||||
|
||||
|
||||
private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle originalRectangle) {
|
||||
private double calculateMinDistancePerRectangle(RedactionEntity entity, int pageNumber, Rectangle2D originalRectangle) {
|
||||
|
||||
return entity.getRedactionPositionsPerPage()
|
||||
.stream()
|
||||
.filter(redactionPosition -> redactionPosition.getPage().getNumber() == originalRectangle.getPage())
|
||||
.filter(redactionPosition -> redactionPosition.getPage().getNumber() == pageNumber)
|
||||
.map(RedactionPosition::getRectanglePerLine)
|
||||
.flatMap(Collection::stream)
|
||||
.mapToDouble(rectangle -> calculateDistance(rectangle, toRectangle2D(originalRectangle)))
|
||||
.mapToDouble(rectangle -> calculateDistance(rectangle, originalRectangle))
|
||||
.min()
|
||||
.orElse(Double.MAX_VALUE);
|
||||
}
|
||||
@ -230,9 +207,77 @@ public class CustomEntityCreationAdapter {
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D toRectangle2D(Rectangle rectangle) {
|
||||
private record EntityIdentifier(
|
||||
String value,
|
||||
List<RectangleWithPage> entityPosition,
|
||||
String ruleIdentifier,
|
||||
String reason,
|
||||
String legalBasis,
|
||||
String type,
|
||||
EntityType entityType,
|
||||
boolean redacted,
|
||||
boolean isDictionaryEntry,
|
||||
boolean isDossierDictionaryEntry) {
|
||||
|
||||
public static EntityIdentifier fromRedactionLogEntry(RedactionLogEntry redactionLogEntry) {
|
||||
|
||||
String ruleIdentifier = redactionLogEntry.getType() + "." + redactionLogEntry.getMatchedRule() + ".0";
|
||||
List<RectangleWithPage> rectangleWithPages = redactionLogEntry.getPositions().stream().map(RectangleWithPage::fromRedactionLogRectangle).toList();
|
||||
return new EntityIdentifier(redactionLogEntry.getValue(),
|
||||
rectangleWithPages,
|
||||
ruleIdentifier,
|
||||
redactionLogEntry.getReason(),
|
||||
redactionLogEntry.getLegalBasis(),
|
||||
redactionLogEntry.getType(),
|
||||
redactionLogEntry.isRecommendation() ? EntityType.RECOMMENDATION : EntityType.ENTITY,
|
||||
redactionLogEntry.isRedacted(),
|
||||
redactionLogEntry.isDictionaryEntry(),
|
||||
redactionLogEntry.isDossierDictionaryEntry());
|
||||
}
|
||||
|
||||
|
||||
public static EntityIdentifier fromManualRedactionEntry(ManualRedactionEntry manualRedactionEntry) {
|
||||
|
||||
List<RectangleWithPage> rectangleWithPages = manualRedactionEntry.getPositions().stream().map(RectangleWithPage::fromAnnotationRectangle).toList();
|
||||
return new EntityIdentifier(manualRedactionEntry.getValue(),
|
||||
rectangleWithPages,
|
||||
"MAN.0.0",
|
||||
manualRedactionEntry.getReason(),
|
||||
manualRedactionEntry.getLegalBasis(),
|
||||
manualRedactionEntry.getType(),
|
||||
EntityType.ENTITY,
|
||||
true,
|
||||
false,
|
||||
false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private record RectangleWithPage(int pageNumber, Rectangle2D rectangle2D) {
|
||||
|
||||
public static RectangleWithPage fromRedactionLogRectangle(Rectangle rectangle) {
|
||||
|
||||
return new RectangleWithPage(rectangle.getPage(), toRectangle2D(rectangle));
|
||||
}
|
||||
|
||||
|
||||
public static RectangleWithPage fromAnnotationRectangle(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) {
|
||||
|
||||
return new RectangleWithPage(rectangle.getPage(), toRectangle2D(rectangle));
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D toRectangle2D(Rectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D toRectangle2D(com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.Rectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
|
||||
}
|
||||
|
||||
return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -58,6 +59,16 @@ public interface SemanticNode {
|
||||
return getTextBlock().getPages();
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the first page associated with this Node
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Page getFirstPage() {
|
||||
|
||||
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||
|
||||
@ -13,6 +13,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.CustomEntityCreationAdapter;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService;
|
||||
@ -75,7 +76,8 @@ public class EntityRedactionService {
|
||||
|
||||
public void addManualAddRedactionEntities(Set<ManualRedactionEntry> manualRedactionEntries, Document document) {
|
||||
|
||||
customEntityCreationAdapter.createRedactionEntities(manualRedactionEntries, document);
|
||||
// Entities are automatically added to the DocumentGraph and don't need to be inserted again.
|
||||
List<RedactionEntity> entities = customEntityCreationAdapter.createRedactionEntities(manualRedactionEntries, document).toList();
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user