diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 4851516a..2cbfd2c8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -12,6 +12,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -71,36 +72,59 @@ public class Section { private List fileAttributes = new ArrayList<>(); + public void addAiEntities(String type, String asType) { - public void addAiEntities(String type, String asType){ + Set entitiesOfType = nerEntities.stream() + .filter(nerEntity -> nerEntity.getType().equals(type)) + .collect(Collectors.toSet()); + Set values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toSet()); + Set found = EntitySearchUtils.find(text, values, asType, headline, sectionNumber, false, false, Engine.NER); + EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary); - Set entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet()); - entitiesOfType.forEach(nerEntity -> nerEntity.setType(asType)); - EntitySearchUtils.clearAndFindPositions(entitiesOfType, searchableText, dictionary); - EntitySearchUtils.addEntitiesWithHigherRank(entities, entitiesOfType, dictionary); + Set finalResult = new HashSet<>(); + + // Only keep Entities with correct offsets from AI Service. + Iterator itty = found.iterator(); + while (itty.hasNext()) { + Entity current = itty.next(); + boolean foundSameOffsets = false; + for (Entity entity : entitiesOfType) { + if (entity.getStart().equals(current.getStart()) && entity.getEnd().equals(current.getEnd())) { + foundSameOffsets = true; + } + } + if (foundSameOffsets) { + finalResult.add(current); + } + } + + EntitySearchUtils.addEntitiesWithHigherRank(entities, finalResult, dictionary); EntitySearchUtils.removeEntitiesContainedInLarger(entities); nerEntities.removeAll(entitiesOfType); } - public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType, int minPartMatches, boolean allowDuplicateTypes){ + public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType, + int minPartMatches, boolean allowDuplicateTypes) { Set combineSet = Set.of(combineTypes.split(",")); - List sorted = nerEntities.stream().sorted(Comparator.comparing(Entity::getStart)).collect(Collectors.toList()); + List sorted = nerEntities.stream() + .sorted(Comparator.comparing(Entity::getStart)) + .collect(Collectors.toList()); Set found = new HashSet<>(); int start = -1; int lastEnd = -1; int numberOfMatchParts = 0; Set foundParts = new HashSet<>(); - for (Entity entity : sorted){ - if(entity.getType().equals(startType) && start == -1) { + for (Entity entity : sorted) { + if (entity.getType().equals(startType) && start == -1) { lastEnd = entity.getEnd(); start = entity.getStart(); foundParts.add(entity.getType()); numberOfMatchParts++; - } else if(!allowDuplicateTypes && foundParts.contains(entity.getType())){ - if(numberOfMatchParts >= minPartMatches) { + } else if (!allowDuplicateTypes && foundParts.contains(entity.getType())) { + if (numberOfMatchParts >= minPartMatches) { String value = searchText.substring(start, lastEnd); found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER)); } @@ -108,14 +132,14 @@ public class Section { lastEnd = -1; numberOfMatchParts = 0; foundParts = new HashSet<>(); - if(entity.getType().equals(startType)){ + if (entity.getType().equals(startType)) { lastEnd = entity.getEnd(); start = entity.getStart(); foundParts.add(entity.getType()); numberOfMatchParts++; } - } else if(entity.getType().equals(startType) && start != -1){ - if(numberOfMatchParts >= minPartMatches) { + } else if (entity.getType().equals(startType) && start != -1) { + if (numberOfMatchParts >= minPartMatches) { String value = searchText.substring(start, lastEnd); found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER)); } @@ -125,24 +149,25 @@ public class Section { foundParts = new HashSet<>(); foundParts.add(entity.getType()); numberOfMatchParts++; - } else if(start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween){ + } else if (start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween) { lastEnd = entity.getEnd(); numberOfMatchParts++; foundParts.add(entity.getType()); } } - if(numberOfMatchParts >= minPartMatches) { + if (numberOfMatchParts >= minPartMatches) { String value = searchText.substring(start, lastEnd); found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER)); } - if(!found.isEmpty()) { + if (!found.isEmpty()) { EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); EntitySearchUtils.removeEntitiesContainedInLarger(entities); } } + @WhenCondition public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id, @Argument(ArgumentType.STRING) String value) { @@ -235,6 +260,7 @@ public class Section { return nerEntities.stream().anyMatch(entity -> !entity.isIgnored() && entity.getType().equals(type)); } + @WhenCondition public boolean matchesType(@Argument(ArgumentType.TYPE) String type) { @@ -257,7 +283,8 @@ public class Section { @ThenAction - public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String suffixPattern, + public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, + @Argument(ArgumentType.REGEX) String suffixPattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group) { @@ -266,7 +293,8 @@ public class Section { @ThenAction - public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String suffixPattern, + public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, + @Argument(ArgumentType.REGEX) String suffixPattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.REGEX) String valuePattern) { @@ -378,7 +406,8 @@ public class Section { } - public void ignore(String type){ + public void ignore(String type) { + entities.removeIf(entity -> entity.getType().equals(type)); } @@ -702,14 +731,8 @@ public class Section { } - private Set findEntities(String value, - String asType, - boolean caseInsensitive, - boolean redacted, - int ruleNumber, - String reason, - String legalBasis, - Engine engine) { + private Set findEntities(String value, String asType, boolean caseInsensitive, boolean redacted, + int ruleNumber, String reason, String legalBasis, Engine engine) { String text = caseInsensitive ? searchText.toLowerCase() : searchText; String searchValue = caseInsensitive ? value.toLowerCase() : value; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 1955cd46..f5eaba79 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -246,28 +246,23 @@ public class EntityRedactionService { Set nerFound = new HashSet<>(); if (!local) { - Map> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts); - nerValuesPerType.entrySet().forEach(entry -> { - EntitySearchUtils.addEntitiesWithHigherRank(nerFound, EntitySearchUtils.find(searchableString, entry.getValue(), entry - .getKey(), headline, sectionNumber, false, false, Engine.NER), dictionary); - }); + nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellstarts, headline)); } return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary), nerFound) ; } - private Map> getNerValues(int sectionNumber, NerEntities nerEntities, - List cellstarts) { + private Set getNerValues(int sectionNumber, NerEntities nerEntities, + List cellstarts, String headline) { - Map> nerValuesPerType = new HashMap<>(); + Set entities = new HashSet<>(); if (redactionServiceSettings.isEnableEntityRecognition() && nerEntities.getResult() .containsKey(sectionNumber)) { nerEntities.getResult().get(sectionNumber).forEach(res -> { if (cellstarts == null || cellstarts.isEmpty()) { - nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>()) - .add(new String(Base64.decodeBase64(res.getValue().getBytes()))); + entities.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false, Engine.NER)); } else { boolean intersectsCellStart = false; for (Integer cellStart : cellstarts) { @@ -276,13 +271,12 @@ public class EntityRedactionService { } } if (!intersectsCellStart) { - nerValuesPerType.computeIfAbsent(res.getType(), (a) -> new HashSet<>()) - .add(new String(Base64.decodeBase64(res.getValue().getBytes()))); + entities.add(new Entity(new String(Base64.decodeBase64(res.getValue().getBytes())), res.getType(), res.getStartOffset(), res.getEndOffset(), headline, sectionNumber, false, false, Engine.NER)); } } }); } - return nerValuesPerType; + return entities; } }