Pull request #329: RED-3133: Fixed wrong ai entries
Merge in RED/redaction-service from RED-3133-Fix to master

* commit '921ae1839cb424a4d82afd88401fddc9cf855f8c':
  RED-3133: Fixed wrong ai entries
This commit is contained in:
commit
b8f7be28e7
@ -12,6 +12,7 @@ import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
@ -71,36 +72,59 @@ public class Section {
|
||||
private List<FileAttribute> fileAttributes = new ArrayList<>();
|
||||
|
||||
|
||||
public void addAiEntities(String type, String asType) {
|
||||
|
||||
public void addAiEntities(String type, String asType){
|
||||
Set<Entity> entitiesOfType = nerEntities.stream()
|
||||
.filter(nerEntity -> nerEntity.getType().equals(type))
|
||||
.collect(Collectors.toSet());
|
||||
Set<String> values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toSet());
|
||||
Set<Entity> found = EntitySearchUtils.find(text, values, asType, headline, sectionNumber, false, false, Engine.NER);
|
||||
EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
|
||||
|
||||
Set<Entity> entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet());
|
||||
entitiesOfType.forEach(nerEntity -> nerEntity.setType(asType));
|
||||
EntitySearchUtils.clearAndFindPositions(entitiesOfType, searchableText, dictionary);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, entitiesOfType, dictionary);
|
||||
Set<Entity> finalResult = new HashSet<>();
|
||||
|
||||
// Keep only those entities in `found` whose start and end offsets both match an entity of the requested type reported by the AI service (nerEntities).
|
||||
Iterator<Entity> itty = found.iterator();
|
||||
while (itty.hasNext()) {
|
||||
Entity current = itty.next();
|
||||
boolean foundSameOffsets = false;
|
||||
for (Entity entity : entitiesOfType) {
|
||||
if (entity.getStart().equals(current.getStart()) && entity.getEnd().equals(current.getEnd())) {
|
||||
foundSameOffsets = true;
|
||||
}
|
||||
}
|
||||
if (foundSameOffsets) {
|
||||
finalResult.add(current);
|
||||
}
|
||||
}
|
||||
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, finalResult, dictionary);
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
nerEntities.removeAll(entitiesOfType);
|
||||
}
|
||||
|
||||
|
||||
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType, int minPartMatches, boolean allowDuplicateTypes){
|
||||
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType,
|
||||
int minPartMatches, boolean allowDuplicateTypes) {
|
||||
|
||||
Set<String> combineSet = Set.of(combineTypes.split(","));
|
||||
|
||||
List<Entity> sorted = nerEntities.stream().sorted(Comparator.comparing(Entity::getStart)).collect(Collectors.toList());
|
||||
List<Entity> sorted = nerEntities.stream()
|
||||
.sorted(Comparator.comparing(Entity::getStart))
|
||||
.collect(Collectors.toList());
|
||||
Set<Entity> found = new HashSet<>();
|
||||
int start = -1;
|
||||
int lastEnd = -1;
|
||||
int numberOfMatchParts = 0;
|
||||
Set<String> foundParts = new HashSet<>();
|
||||
for (Entity entity : sorted){
|
||||
if(entity.getType().equals(startType) && start == -1) {
|
||||
for (Entity entity : sorted) {
|
||||
if (entity.getType().equals(startType) && start == -1) {
|
||||
lastEnd = entity.getEnd();
|
||||
start = entity.getStart();
|
||||
foundParts.add(entity.getType());
|
||||
numberOfMatchParts++;
|
||||
} else if(!allowDuplicateTypes && foundParts.contains(entity.getType())){
|
||||
if(numberOfMatchParts >= minPartMatches) {
|
||||
} else if (!allowDuplicateTypes && foundParts.contains(entity.getType())) {
|
||||
if (numberOfMatchParts >= minPartMatches) {
|
||||
String value = searchText.substring(start, lastEnd);
|
||||
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
|
||||
}
|
||||
@ -108,14 +132,14 @@ public class Section {
|
||||
lastEnd = -1;
|
||||
numberOfMatchParts = 0;
|
||||
foundParts = new HashSet<>();
|
||||
if(entity.getType().equals(startType)){
|
||||
if (entity.getType().equals(startType)) {
|
||||
lastEnd = entity.getEnd();
|
||||
start = entity.getStart();
|
||||
foundParts.add(entity.getType());
|
||||
numberOfMatchParts++;
|
||||
}
|
||||
} else if(entity.getType().equals(startType) && start != -1){
|
||||
if(numberOfMatchParts >= minPartMatches) {
|
||||
} else if (entity.getType().equals(startType) && start != -1) {
|
||||
if (numberOfMatchParts >= minPartMatches) {
|
||||
String value = searchText.substring(start, lastEnd);
|
||||
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
|
||||
}
|
||||
@ -125,24 +149,25 @@ public class Section {
|
||||
foundParts = new HashSet<>();
|
||||
foundParts.add(entity.getType());
|
||||
numberOfMatchParts++;
|
||||
} else if(start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween){
|
||||
} else if (start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween) {
|
||||
lastEnd = entity.getEnd();
|
||||
numberOfMatchParts++;
|
||||
foundParts.add(entity.getType());
|
||||
}
|
||||
}
|
||||
|
||||
if(numberOfMatchParts >= minPartMatches) {
|
||||
if (numberOfMatchParts >= minPartMatches) {
|
||||
String value = searchText.substring(start, lastEnd);
|
||||
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
|
||||
}
|
||||
|
||||
if(!found.isEmpty()) {
|
||||
if (!found.isEmpty()) {
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
@ -235,6 +260,7 @@ public class Section {
|
||||
return nerEntities.stream().anyMatch(entity -> !entity.isIgnored() && entity.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
@ -257,7 +283,8 @@ public class Section {
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String suffixPattern,
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.REGEX) String suffixPattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@Argument(ArgumentType.INTEGER) int group) {
|
||||
|
||||
@ -266,7 +293,8 @@ public class Section {
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String suffixPattern,
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.REGEX) String suffixPattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@Argument(ArgumentType.INTEGER) int group,
|
||||
@Argument(ArgumentType.REGEX) String valuePattern) {
|
||||
@ -378,7 +406,8 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public void ignore(String type){
|
||||
public void ignore(String type) {
|
||||
|
||||
entities.removeIf(entity -> entity.getType().equals(type));
|
||||
}
|
||||
|
||||
@ -702,14 +731,8 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(String value,
|
||||
String asType,
|
||||
boolean caseInsensitive,
|
||||
boolean redacted,
|
||||
int ruleNumber,
|
||||
String reason,
|
||||
String legalBasis,
|
||||
Engine engine) {
|
||||
private Set<Entity> findEntities(String value, String asType, boolean caseInsensitive, boolean redacted,
|
||||
int ruleNumber, String reason, String legalBasis, Engine engine) {
|
||||
|
||||
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
|
||||
String searchValue = caseInsensitive ? value.toLowerCase() : value;
|
||||
|
||||
@ -246,28 +246,23 @@ public class EntityRedactionService {
|
||||
|
||||
Set<Entity> nerFound = new HashSet<>();
|
||||
if (!local) {
|
||||
Map<String, Set<String>> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts);
|
||||
nerValuesPerType.entrySet().forEach(entry -> {
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(nerFound, EntitySearchUtils.find(searchableString, entry.getValue(), entry
|
||||
.getKey(), headline, sectionNumber, false, false, Engine.NER), dictionary);
|
||||
});
|
||||
nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellstarts, headline));
|
||||
}
|
||||
|
||||
return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary), nerFound) ;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, Set<String>> getNerValues(int sectionNumber, NerEntities nerEntities,
|
||||
List<Integer> cellstarts) {
|
||||
private Set<Entity> getNerValues(int sectionNumber, NerEntities nerEntities,
|
||||
List<Integer> cellstarts, String headline) {
|
||||
|
||||
Map<String, Set<String>> nerValuesPerType = new HashMap<>();
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
|
||||
if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities.getResult()
|
||||
.containsKey(sectionNumber)) {
|
||||
nerEntities.getResult().get(sectionNumber).forEach(res -> {
|
||||
if (cellstarts == null || cellstarts.isEmpty()) {
|
||||
nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>())
|
||||
.add(new String(Base64.decodeBase64(res.getValue().getBytes())));
|
||||
entities.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false, Engine.NER));
|
||||
} else {
|
||||
boolean intersectsCellStart = false;
|
||||
for (Integer cellStart : cellstarts) {
|
||||
@ -276,13 +271,12 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
if (!intersectsCellStart) {
|
||||
nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>())
|
||||
.add(new String(Base64.decodeBase64(res.getValue().getBytes())));
|
||||
entities.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false, Engine.NER));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
return nerValuesPerType;
|
||||
return entities;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user