Pull request #371: RED-3761 Fix expandByRegEx and expandByPrefixRegEx

Merge in RED/redaction-service from kbudisantoso/RED-3761-fix-filter-expanded to master

* commit '5b5d898b933c3ae79fb7cb07a6105b15d2952f8e':
  RED-3761 Fix expandByRegEx and expandByPrefixRegEx
This commit is contained in:
Kresnadi Budisantoso 2022-04-11 12:51:57 +02:00 committed by Dominique Eiflaender
commit efd53066c4
7 changed files with 88 additions and 22 deletions

View File

@ -261,9 +261,7 @@ public class Section {
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.REGEX) String valuePattern) {
if (StringUtils.isEmpty(prefixPattern)) {
return;
}
if (StringUtils.isEmpty(prefixPattern)) return;
var compiledValuePattern = valuePattern == null ? null : Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive);
var compiledPrefixPattern = Patterns.getCompiledPattern(prefixPattern, patternCaseInsensitive);
@ -318,13 +316,10 @@ public class Section {
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.REGEX) String valuePattern) {
Pattern compiledValuePattern = null;
if (StringUtils.isEmpty(suffixPattern)) return;
if (valuePattern != null) {
compiledValuePattern = Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive);
}
Pattern compiledPattern = Patterns.getCompiledPattern(suffixPattern, patternCaseInsensitive);
var compiledValuePattern = valuePattern == null ? null : Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive);
var compiledSuffixPattern = Patterns.getCompiledPattern(suffixPattern, patternCaseInsensitive);
Set<Entity> expanded = new HashSet<>();
for (Entity entity : entities) {
@ -334,16 +329,16 @@ public class Section {
}
if (valuePattern != null) {
Matcher valueMatcher = compiledValuePattern.matcher(entity.getWord());
var valueMatcher = compiledValuePattern.matcher(entity.getWord());
if (!valueMatcher.matches()) {
continue;
}
}
Matcher matcher = compiledPattern.matcher(entity.getTextAfter());
var matcher = compiledSuffixPattern.matcher(entity.getTextAfter());
while (matcher.find()) {
String match = matcher.group(group);
var match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
@ -352,7 +347,7 @@ public class Section {
continue;
}
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE, false);
var expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE, false);
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
}
}

View File

@ -286,12 +286,16 @@ public class EntitySearchUtils {
}
}
/*
* returns true if the found entity overlaps with an existing entity in a way
* that neither entity is a subset of the other
*/
private boolean overlaps(Set<Entity> existingEntities, Entity found) {
for (Entity existing : existingEntities) {
if (existing.getStart().equals(found.getStart())) {
// skip if either start or end is equal
if (existing.getStart().equals(found.getStart()) || existing.getEnd().equals(found.getEnd())) {
continue;
}

View File

@ -303,6 +303,8 @@ public class RedactionIntegrationTest {
@Test
public void redactionExpansionOverlap() throws IOException {
// F. Lastname, J. Doe, M. Mustermann
// Lastname M., Doe J., Mustermann M.
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/ExpansionTest.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
@ -1534,6 +1536,44 @@ public class RedactionIntegrationTest {
}
}
/**
 * Integration test for salutation-prefix expansion.
 *
 * <p>Preconditions: the AUTHOR dictionary holds the bare names ("Robinson",
 * "Bojangles", "Tambourine Man") but none of the salutation-prefixed variants.
 * After analysing {@code files/mr-mrs.pdf}, the redaction log is expected to
 * contain the prefixed variants, which therefore cannot come from the
 * dictionary itself.
 *
 * <p>The annotated document is also written to the temporary directory as
 * {@code Annotated.pdf} so it can be inspected manually.
 *
 * @throws IOException if the test resource cannot be read or the annotated
 *                     file cannot be written
 */
@Test
public void testExpandByPrefixRegEx() throws IOException {

    // AssertJ's assertThat(boolean) checks nothing on its own; it must be
    // terminated with isTrue()/isFalse() for the precondition to be enforced.
    assertThat(dictionary.get(AUTHOR).contains("Robinson")).isTrue();
    assertThat(dictionary.get(AUTHOR).contains("Mrs. Robinson")).isFalse();
    assertThat(dictionary.get(AUTHOR).contains("Bojangles")).isTrue();
    assertThat(dictionary.get(AUTHOR).contains("Mr. Bojangles")).isFalse();
    assertThat(dictionary.get(AUTHOR).contains("Tambourine Man")).isTrue();
    assertThat(dictionary.get(AUTHOR).contains("Mr. Tambourine Man")).isFalse();

    String fileName = "files/mr-mrs.pdf";
    String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";

    ClassPathResource pdfFileResource = new ClassPathResource(fileName);
    AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());

    analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
    analyzeService.analyze(request);

    AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
            .dossierId(TEST_DOSSIER_ID)
            .fileId(TEST_FILE_ID)
            .build());

    // Persist the annotated PDF for manual inspection; the stream is closed
    // by try-with-resources even if the write fails.
    try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
        fileOutputStream.write(annotateResponse.getDocument());
    }

    var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
    var values = redactionLog.getRedactionLogEntry()
            .stream()
            .map(RedactionLogEntry::getValue)
            .collect(Collectors.toList());

    // The prefixed names are absent from the dictionary (checked above), so
    // finding them here shows the entities were expanded during analysis.
    assertThat(values).contains("Mrs. Robinson");
    assertThat(values).contains("Mr. Bojangles");
    assertThat(values).contains("Mr. Tambourine Man");
}
@SneakyThrows
private AnalyzeRequest prepareStorage(InputStream stream) {

View File

@ -145,14 +145,17 @@ public class EntitySearchUtilsTest {
Set<Entity> foundEntities = new HashSet<>();
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
Entity foundEntities3 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
foundEntities.add(foundEntities1);
foundEntities.add(foundEntities2);
foundEntities.add(foundEntities3);
// Act
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
// Assert
assertThat(result.size()).isEqualTo(0);
assertThat(result.size()).isEqualTo(1);
assertThat(result.contains(foundEntities2));
}
@ -173,16 +176,21 @@ public class EntitySearchUtilsTest {
existingEntities.add(existingEntity2);
Set<Entity> foundEntities = new HashSet<>();
Entity foundEntities1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
foundEntities.add(foundEntities1);
foundEntities.add(foundEntities2);
Entity foundEntitiesOverlap1 = new Entity("Batman X. Superman Y.", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
Entity foundEntitiesOverlap2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
Entity foundEntitiesSubset1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
Entity foundEntitiesSubset2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
foundEntities.add(foundEntitiesOverlap1);
foundEntities.add(foundEntitiesOverlap2);
foundEntities.add(foundEntitiesSubset1);
foundEntities.add(foundEntitiesSubset2);
// Act
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
// Assert
assertThat(result.size()).isEqualTo(0);
assertThat(result.size()).isEqualTo(2);
assertThat(result).containsExactlyInAnyOrder(foundEntitiesSubset1, foundEntitiesSubset2);
}

View File

@ -8620,4 +8620,6 @@ Doe
M. Mustermann
F. Lastname
Mustermann
Lastname
Lastname
Bojangles
Tambourine Man

View File

@ -36,6 +36,23 @@ rule "0: Expand CBI Authors with firstname initials"
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
end
// Expands already-found CBI_author and PII entities so that a directly
// preceding salutation (Mr, Mrs, Ms, Miss, Sir, Madam, Madame, Mme) becomes
// part of the entity.
//
// when: the section matches type CBI_author or PII AND its searchText contains
//       at least one of the salutation tokens. This is a cheap pre-filter so
//       the regex expansion only runs on sections that can match.
//       NOTE(review): contains("Mr") already covers "Mrs" and contains("Madam")
//       already covers "Madame" (substring match, assuming searchText is a
//       String); the extra terms look like they are kept to mirror the regex
//       alternatives - confirm before simplifying.
// then: calls expandByPrefixRegEx once per entity type with the same prefix
//       pattern: a word boundary, one salutation, then an optional whitespace
//       character, an optional '.', and any trailing whitespace.
//       The arguments `false, 0` presumably map to patternCaseInsensitive=false
//       and group=0 (the whole regex match) - verify against the
//       Section.expandByPrefixRegEx overload.
rule "0: Expand CBI_author and PII matches with salutation prefix"
when
Section((matchesType("CBI_author") || matchesType("PII")) && (
searchText.contains("Mr")
|| searchText.contains("Mrs")
|| searchText.contains("Ms")
|| searchText.contains("Miss")
|| searchText.contains("Sir")
|| searchText.contains("Madam")
|| searchText.contains("Madame")
|| searchText.contains("Mme")
))
then
section.expandByPrefixRegEx("CBI_author", "\\b(Mrs?|Ms|Miss|Sir|Madame?|Mme)\\s?\\.?\\s*", false, 0);
section.expandByPrefixRegEx("PII", "\\b(Mrs?|Ms|Miss|Sir|Madame?|Mme)\\s?\\.?\\s*", false, 0);
end
rule "1: Redacted because Section contains Vertebrate"
when