diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 73b35204..9cc81ce7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -261,9 +261,7 @@ public class Section { @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.REGEX) String valuePattern) { - if (StringUtils.isEmpty(prefixPattern)) { - return; - } + if (StringUtils.isEmpty(prefixPattern)) return; var compiledValuePattern = valuePattern == null ? null : Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive); var compiledPrefixPattern = Patterns.getCompiledPattern(prefixPattern, patternCaseInsensitive); @@ -318,13 +316,10 @@ public class Section { @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.REGEX) String valuePattern) { - Pattern compiledValuePattern = null; + if (StringUtils.isEmpty(suffixPattern)) return; - if (valuePattern != null) { - compiledValuePattern = Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive); - } - - Pattern compiledPattern = Patterns.getCompiledPattern(suffixPattern, patternCaseInsensitive); + var compiledValuePattern = valuePattern == null ? 
null : Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive); + var compiledSuffixPattern = Patterns.getCompiledPattern(suffixPattern, patternCaseInsensitive); Set expanded = new HashSet<>(); for (Entity entity : entities) { @@ -334,16 +329,16 @@ public class Section { } if (valuePattern != null) { - Matcher valueMatcher = compiledValuePattern.matcher(entity.getWord()); + var valueMatcher = compiledValuePattern.matcher(entity.getWord()); if (!valueMatcher.matches()) { continue; } } - Matcher matcher = compiledPattern.matcher(entity.getTextAfter()); + var matcher = compiledSuffixPattern.matcher(entity.getTextAfter()); while (matcher.find()) { - String match = matcher.group(group); + var match = matcher.group(group); if (StringUtils.isNotBlank(match)) { @@ -352,7 +347,7 @@ public class Section { continue; } - Set expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE, false); + var expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE, false); expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities)); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index e4542ebe..c329bd76 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -286,12 +286,16 @@ public class 
EntitySearchUtils { } } - + /* + * returns true if the found entity overlaps with an existing entity in a way + * that neither entity is a subset of the other + */ private boolean overlaps(Set existingEntities, Entity found) { for (Entity existing : existingEntities) { - if (existing.getStart().equals(found.getStart())) { + // equal start or equal end means one span contains the other (subset), so it is not a partial overlap + if (existing.getStart().equals(found.getStart()) || existing.getEnd().equals(found.getEnd())) { continue; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 51468caa..f720e847 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -303,6 +303,8 @@ public class RedactionIntegrationTest { @Test public void redactionExpansionOverlap() throws IOException { + // F. Lastname, J. Doe, M. Mustermann + // Lastname M., Doe J., Mustermann M. ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/ExpansionTest.pdf"); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); @@ -1534,6 +1536,44 @@ public class RedactionIntegrationTest { } } + @Test + public void testExpandByPrefixRegEx() throws IOException { + + assertThat(dictionary.get(AUTHOR).contains("Robinson")).isTrue(); /* precondition: the dictionary holds the bare names but not the salutation-prefixed forms */ + assertThat(dictionary.get(AUTHOR).contains("Mrs. Robinson")).isFalse(); + assertThat(dictionary.get(AUTHOR).contains("Bojangles")).isTrue(); + assertThat(dictionary.get(AUTHOR).contains("Mr. Bojangles")).isFalse(); + assertThat(dictionary.get(AUTHOR).contains("Tambourine Man")).isTrue(); + assertThat(dictionary.get(AUTHOR).contains("Mr. 
Tambourine Man")).isFalse(); + + String fileName = "files/mr-mrs.pdf"; + String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf"; + + ClassPathResource pdfFileResource = new ClassPathResource(fileName); + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + analyzeService.analyze(request); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + var values = redactionLog.getRedactionLogEntry() + .stream() + .map(RedactionLogEntry::getValue) + .collect(Collectors.toList()); + + assertThat(values).contains("Mrs. Robinson"); + assertThat(values).contains("Mr. Bojangles"); + assertThat(values).contains("Mr. 
Tambourine Man"); + } @SneakyThrows private AnalyzeRequest prepareStorage(InputStream stream) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java index 7db8f371..940ec9da 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java @@ -145,14 +145,17 @@ public class EntitySearchUtilsTest { Set foundEntities = new HashSet<>(); Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); + Entity foundEntities3 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); foundEntities.add(foundEntities1); foundEntities.add(foundEntities2); + foundEntities.add(foundEntities3); // Act Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); // Assert - assertThat(result.size()).isEqualTo(0); + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities2); } @@ -173,16 +176,21 @@ public class EntitySearchUtilsTest { existingEntities.add(existingEntity2); Set foundEntities = new HashSet<>(); - Entity foundEntities1 = new Entity("Batman X. 
Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); - Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); - foundEntities.add(foundEntities1); - foundEntities.add(foundEntities2); + Entity foundEntitiesOverlap1 = new Entity("Batman X. Superman Y.", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); + Entity foundEntitiesOverlap2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); + Entity foundEntitiesSubset1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); + Entity foundEntitiesSubset2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY); + foundEntities.add(foundEntitiesOverlap1); + foundEntities.add(foundEntitiesOverlap2); + foundEntities.add(foundEntitiesSubset1); + foundEntities.add(foundEntitiesSubset2); // Act Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); // Assert - assertThat(result.size()).isEqualTo(0); + assertThat(result.size()).isEqualTo(2); + assertThat(result).containsExactlyInAnyOrder(foundEntitiesSubset1, foundEntitiesSubset2); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt index 634f9d81..344db2ca 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt @@ -8620,4 +8620,6 @@ Doe M. Mustermann F. 
Lastname Mustermann -Lastname \ No newline at end of file +Lastname +Bojangles +Tambourine Man \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 1de194b4..35d751b7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -36,6 +36,23 @@ rule "0: Expand CBI Authors with firstname initials" section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); end +rule "0: Expand CBI_author and PII matches with salutation prefix" + when + Section((matchesType("CBI_author") || matchesType("PII")) && ( + searchText.contains("Mr") + || searchText.contains("Mrs") + || searchText.contains("Ms") + || searchText.contains("Miss") + || searchText.contains("Sir") + || searchText.contains("Madam") + || searchText.contains("Madame") + || searchText.contains("Mme") + )) + then + section.expandByPrefixRegEx("CBI_author", "\\b(Mrs?|Ms|Miss|Sir|Madame?|Mme)\\s?\\.?\\s*", false, 0); + section.expandByPrefixRegEx("PII", "\\b(Mrs?|Ms|Miss|Sir|Madame?|Mme)\\s?\\.?\\s*", false, 0); + end + rule "1: Redacted because Section contains Vertebrate" when diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/mr-mrs.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/mr-mrs.pdf new file mode 100644 index 00000000..0e8f5046 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/mr-mrs.pdf differ