Pull request #329: RED-3133: Fixed wrong ai entries

Merge in RED/redaction-service from RED-3133-Fix to master

* commit '921ae1839cb424a4d82afd88401fddc9cf855f8c':
  RED-3133: Fixed wrong ai entries
This commit is contained in:
Dominique Eiflaender 2022-01-26 15:09:04 +01:00
commit b8f7be28e7
2 changed files with 58 additions and 41 deletions

View File

@ -12,6 +12,7 @@ import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -71,36 +72,59 @@ public class Section {
private List<FileAttribute> fileAttributes = new ArrayList<>();
public void addAiEntities(String type, String asType) {
public void addAiEntities(String type, String asType){
Set<Entity> entitiesOfType = nerEntities.stream()
.filter(nerEntity -> nerEntity.getType().equals(type))
.collect(Collectors.toSet());
Set<String> values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toSet());
Set<Entity> found = EntitySearchUtils.find(text, values, asType, headline, sectionNumber, false, false, Engine.NER);
EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
Set<Entity> entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet());
entitiesOfType.forEach(nerEntity -> nerEntity.setType(asType));
EntitySearchUtils.clearAndFindPositions(entitiesOfType, searchableText, dictionary);
EntitySearchUtils.addEntitiesWithHigherRank(entities, entitiesOfType, dictionary);
Set<Entity> finalResult = new HashSet<>();
// Only keep Entities with correct offsets from AI Service.
Iterator<Entity> itty = found.iterator();
while (itty.hasNext()) {
Entity current = itty.next();
boolean foundSameOffsets = false;
for (Entity entity : entitiesOfType) {
if (entity.getStart().equals(current.getStart()) && entity.getEnd().equals(current.getEnd())) {
foundSameOffsets = true;
}
}
if (foundSameOffsets) {
finalResult.add(current);
}
}
EntitySearchUtils.addEntitiesWithHigherRank(entities, finalResult, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
nerEntities.removeAll(entitiesOfType);
}
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType, int minPartMatches, boolean allowDuplicateTypes){
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType,
int minPartMatches, boolean allowDuplicateTypes) {
Set<String> combineSet = Set.of(combineTypes.split(","));
List<Entity> sorted = nerEntities.stream().sorted(Comparator.comparing(Entity::getStart)).collect(Collectors.toList());
List<Entity> sorted = nerEntities.stream()
.sorted(Comparator.comparing(Entity::getStart))
.collect(Collectors.toList());
Set<Entity> found = new HashSet<>();
int start = -1;
int lastEnd = -1;
int numberOfMatchParts = 0;
Set<String> foundParts = new HashSet<>();
for (Entity entity : sorted){
if(entity.getType().equals(startType) && start == -1) {
for (Entity entity : sorted) {
if (entity.getType().equals(startType) && start == -1) {
lastEnd = entity.getEnd();
start = entity.getStart();
foundParts.add(entity.getType());
numberOfMatchParts++;
} else if(!allowDuplicateTypes && foundParts.contains(entity.getType())){
if(numberOfMatchParts >= minPartMatches) {
} else if (!allowDuplicateTypes && foundParts.contains(entity.getType())) {
if (numberOfMatchParts >= minPartMatches) {
String value = searchText.substring(start, lastEnd);
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
}
@ -108,14 +132,14 @@ public class Section {
lastEnd = -1;
numberOfMatchParts = 0;
foundParts = new HashSet<>();
if(entity.getType().equals(startType)){
if (entity.getType().equals(startType)) {
lastEnd = entity.getEnd();
start = entity.getStart();
foundParts.add(entity.getType());
numberOfMatchParts++;
}
} else if(entity.getType().equals(startType) && start != -1){
if(numberOfMatchParts >= minPartMatches) {
} else if (entity.getType().equals(startType) && start != -1) {
if (numberOfMatchParts >= minPartMatches) {
String value = searchText.substring(start, lastEnd);
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
}
@ -125,24 +149,25 @@ public class Section {
foundParts = new HashSet<>();
foundParts.add(entity.getType());
numberOfMatchParts++;
} else if(start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween){
} else if (start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween) {
lastEnd = entity.getEnd();
numberOfMatchParts++;
foundParts.add(entity.getType());
}
}
if(numberOfMatchParts >= minPartMatches) {
if (numberOfMatchParts >= minPartMatches) {
String value = searchText.substring(start, lastEnd);
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
}
if(!found.isEmpty()) {
if (!found.isEmpty()) {
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
}
@WhenCondition
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
@Argument(ArgumentType.STRING) String value) {
@ -235,6 +260,7 @@ public class Section {
return nerEntities.stream().anyMatch(entity -> !entity.isIgnored() && entity.getType().equals(type));
}
@WhenCondition
public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {
@ -257,7 +283,8 @@ public class Section {
@ThenAction
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String suffixPattern,
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REGEX) String suffixPattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group) {
@ -266,7 +293,8 @@ public class Section {
@ThenAction
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String suffixPattern,
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REGEX) String suffixPattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.REGEX) String valuePattern) {
@ -378,7 +406,8 @@ public class Section {
}
public void ignore(String type){
public void ignore(String type) {
entities.removeIf(entity -> entity.getType().equals(type));
}
@ -702,14 +731,8 @@ public class Section {
}
private Set<Entity> findEntities(String value,
String asType,
boolean caseInsensitive,
boolean redacted,
int ruleNumber,
String reason,
String legalBasis,
Engine engine) {
private Set<Entity> findEntities(String value, String asType, boolean caseInsensitive, boolean redacted,
int ruleNumber, String reason, String legalBasis, Engine engine) {
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
String searchValue = caseInsensitive ? value.toLowerCase() : value;

View File

@ -246,28 +246,23 @@ public class EntityRedactionService {
Set<Entity> nerFound = new HashSet<>();
if (!local) {
Map<String, Set<String>> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts);
nerValuesPerType.entrySet().forEach(entry -> {
EntitySearchUtils.addEntitiesWithHigherRank(nerFound, EntitySearchUtils.find(searchableString, entry.getValue(), entry
.getKey(), headline, sectionNumber, false, false, Engine.NER), dictionary);
});
nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellstarts, headline));
}
return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary), nerFound) ;
}
private Map<String, Set<String>> getNerValues(int sectionNumber, NerEntities nerEntities,
List<Integer> cellstarts) {
private Set<Entity> getNerValues(int sectionNumber, NerEntities nerEntities,
List<Integer> cellstarts, String headline) {
Map<String, Set<String>> nerValuesPerType = new HashMap<>();
Set<Entity> entities = new HashSet<>();
if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities.getResult()
.containsKey(sectionNumber)) {
nerEntities.getResult().get(sectionNumber).forEach(res -> {
if (cellstarts == null || cellstarts.isEmpty()) {
nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>())
.add(new String(Base64.decodeBase64(res.getValue().getBytes())));
entities.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false, Engine.NER));
} else {
boolean intersectsCellStart = false;
for (Integer cellStart : cellstarts) {
@ -276,13 +271,12 @@ public class EntityRedactionService {
}
}
if (!intersectsCellStart) {
nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>())
.add(new String(Base64.decodeBase64(res.getValue().getBytes())));
entities.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false, Engine.NER));
}
}
});
}
return nerValuesPerType;
return entities;
}
}