Pull request #371: RED-3761 Fix expandByRegEx and expandByPrefixRegEx
Merge in RED/redaction-service from kbudisantoso/RED-3761-fix-filter-expanded to master * commit '5b5d898b933c3ae79fb7cb07a6105b15d2952f8e': RED-3761 Fix expandByRegEx and expandByPrefixRegEx
This commit is contained in:
commit
efd53066c4
@ -261,9 +261,7 @@ public class Section {
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group,
|
||||
@Argument(ArgumentType.REGEX) String valuePattern) {
|
||||
|
||||
if (StringUtils.isEmpty(prefixPattern)) {
|
||||
return;
|
||||
}
|
||||
if (StringUtils.isEmpty(prefixPattern)) return;
|
||||
|
||||
var compiledValuePattern = valuePattern == null ? null : Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive);
|
||||
var compiledPrefixPattern = Patterns.getCompiledPattern(prefixPattern, patternCaseInsensitive);
|
||||
@ -318,13 +316,10 @@ public class Section {
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group,
|
||||
@Argument(ArgumentType.REGEX) String valuePattern) {
|
||||
|
||||
Pattern compiledValuePattern = null;
|
||||
if (StringUtils.isEmpty(suffixPattern)) return;
|
||||
|
||||
if (valuePattern != null) {
|
||||
compiledValuePattern = Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive);
|
||||
}
|
||||
|
||||
Pattern compiledPattern = Patterns.getCompiledPattern(suffixPattern, patternCaseInsensitive);
|
||||
var compiledValuePattern = valuePattern == null ? null : Patterns.getCompiledPattern(valuePattern, patternCaseInsensitive);
|
||||
var compiledSuffixPattern = Patterns.getCompiledPattern(suffixPattern, patternCaseInsensitive);
|
||||
|
||||
Set<Entity> expanded = new HashSet<>();
|
||||
for (Entity entity : entities) {
|
||||
@ -334,16 +329,16 @@ public class Section {
|
||||
}
|
||||
|
||||
if (valuePattern != null) {
|
||||
Matcher valueMatcher = compiledValuePattern.matcher(entity.getWord());
|
||||
var valueMatcher = compiledValuePattern.matcher(entity.getWord());
|
||||
if (!valueMatcher.matches()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
Matcher matcher = compiledPattern.matcher(entity.getTextAfter());
|
||||
var matcher = compiledSuffixPattern.matcher(entity.getTextAfter());
|
||||
|
||||
while (matcher.find()) {
|
||||
String match = matcher.group(group);
|
||||
var match = matcher.group(group);
|
||||
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
|
||||
@ -352,7 +347,7 @@ public class Section {
|
||||
continue;
|
||||
}
|
||||
|
||||
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE, false);
|
||||
var expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE, false);
|
||||
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
|
||||
}
|
||||
}
|
||||
|
||||
@ -286,12 +286,16 @@ public class EntitySearchUtils {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* returns true if the found entity overlaps with an existing entity in a way
|
||||
* that neither entity is a subset of the other
|
||||
*/
|
||||
private boolean overlaps(Set<Entity> existingEntities, Entity found) {
|
||||
|
||||
for (Entity existing : existingEntities) {
|
||||
|
||||
if (existing.getStart().equals(found.getStart())) {
|
||||
// skip if either start or end is equal
|
||||
if (existing.getStart().equals(found.getStart()) || existing.getEnd().equals(found.getEnd())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -303,6 +303,8 @@ public class RedactionIntegrationTest {
|
||||
@Test
|
||||
public void redactionExpansionOverlap() throws IOException {
|
||||
|
||||
// F. Lastname, J. Doe, M. Mustermann
|
||||
// Lastname M., Doe J., Mustermann M.
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/ExpansionTest.pdf");
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
@ -1534,6 +1536,44 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExpandByPrefixRegEx() throws IOException {
|
||||
|
||||
assertThat(dictionary.get(AUTHOR).contains("Robinson"));
|
||||
assertThat(! dictionary.get(AUTHOR).contains("Mrs. Robinson"));
|
||||
assertThat(dictionary.get(AUTHOR).contains("Bojangles"));
|
||||
assertThat(! dictionary.get(AUTHOR).contains("Mr. Bojangles"));
|
||||
assertThat(dictionary.get(AUTHOR).contains("Tambourine Man"));
|
||||
assertThat(! dictionary.get(AUTHOR).contains("Mr. Tambourine Man"));
|
||||
|
||||
String fileName = "files/mr-mrs.pdf";
|
||||
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
analyzeService.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.dossierId(TEST_DOSSIER_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||
var values = redactionLog.getRedactionLogEntry()
|
||||
.stream()
|
||||
.map(RedactionLogEntry::getValue)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertThat(values).contains("Mrs. Robinson");
|
||||
assertThat(values).contains("Mr. Bojangles");
|
||||
assertThat(values).contains("Mr. Tambourine Man");
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private AnalyzeRequest prepareStorage(InputStream stream) {
|
||||
|
||||
@ -145,14 +145,17 @@ public class EntitySearchUtilsTest {
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
Entity foundEntities3 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
foundEntities.add(foundEntities3);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(0);
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result.contains(foundEntities2));
|
||||
|
||||
}
|
||||
|
||||
@ -173,16 +176,21 @@ public class EntitySearchUtilsTest {
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
Entity foundEntitiesOverlap1 = new Entity("Batman X. Superman Y.", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
Entity foundEntitiesOverlap2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
Entity foundEntitiesSubset1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
Entity foundEntitiesSubset2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE, EntityType.ENTITY);
|
||||
foundEntities.add(foundEntitiesOverlap1);
|
||||
foundEntities.add(foundEntitiesOverlap2);
|
||||
foundEntities.add(foundEntitiesSubset1);
|
||||
foundEntities.add(foundEntitiesSubset2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(0);
|
||||
assertThat(result.size()).isEqualTo(2);
|
||||
assertThat(result).containsExactlyInAnyOrder(foundEntitiesSubset1, foundEntitiesSubset2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -8620,4 +8620,6 @@ Doe
|
||||
M. Mustermann
|
||||
F. Lastname
|
||||
Mustermann
|
||||
Lastname
|
||||
Lastname
|
||||
Bojangles
|
||||
Tambourine Man
|
||||
@ -36,6 +36,23 @@ rule "0: Expand CBI Authors with firstname initials"
|
||||
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
|
||||
end
|
||||
|
||||
// Fires for sections that match type CBI_author or PII and whose search text contains one of
// the listed salutation strings; it then calls expandByPrefixRegEx once per type with the
// salutation regex. The trailing arguments (false, 0) are presumably patternCaseInsensitive
// and the regex group index -- confirm against Section.expandByPrefixRegEx.
rule "0: Expand CBI_author and PII matches with salutation prefix"
    when
        Section(
            (matchesType("CBI_author") || matchesType("PII"))
            && (searchText.contains("Mr") || searchText.contains("Mrs")
                || searchText.contains("Ms") || searchText.contains("Miss")
                || searchText.contains("Sir") || searchText.contains("Madam")
                || searchText.contains("Madame") || searchText.contains("Mme"))
        )
    then
        section.expandByPrefixRegEx("CBI_author", "\\b(Mrs?|Ms|Miss|Sir|Madame?|Mme)\\s?\\.?\\s*", false, 0);
        section.expandByPrefixRegEx("PII", "\\b(Mrs?|Ms|Miss|Sir|Madame?|Mme)\\s?\\.?\\s*", false, 0);
end
|
||||
|
||||
|
||||
rule "1: Redacted because Section contains Vertebrate"
|
||||
when
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user