RED-10046 The punctuation mark “.” should be treated as a word boundary when...

This commit is contained in:
Corina Olariu 2024-11-21 10:20:44 +01:00
parent 213d3bf645
commit dfd262e9e1
8 changed files with 28 additions and 4 deletions

View File

@@ -55,6 +55,7 @@ public final class SeparatorUtils {
return textRange.end() == textBlock.getTextRange().end() ||//
SeparatorUtils.isSeparator(textBlock.charAt(textRange.end())) ||//
SeparatorUtils.isSeparator(textBlock.charAt(textRange.end() - 1)) ||//
SeparatorUtils.isJapaneseSeparator(textBlock.charAt(textRange.end() - 1));
}

View File

@@ -20,6 +20,18 @@ public class SearchImplementationTest extends BuildDocumentIntegrationTest {
private EntityEnrichmentService entityEnrichmentService;
/**
 * Verifies that a search for an author name containing periods yields the expected matches.
 *
 * <p>Builds the document graph from the "TestPunctuation" minimal example, runs a
 * {@code SearchImplementation} for "Kuhn, J. O." through
 * {@code EntityCreationService#bySearchImplementation} and expects exactly two
 * "CBI_author" entities.
 */
@Test
public void testSearchImplementationWithPunctuation() {

    final String authorName = "Kuhn, J. O.";
    Document punctuationDocument = buildGraph("files/Minimal Examples/TestPunctuation");

    // NOTE(review): the boolean flag presumably controls case handling of the search — confirm against SearchImplementation.
    SearchImplementation authorSearch = new SearchImplementation(List.of(authorName), true);
    EntityCreationService creationService = new EntityCreationService(entityEnrichmentService);

    List<TextEntity> foundAuthors = creationService.bySearchImplementation(authorSearch, "CBI_author", EntityType.ENTITY, punctuationDocument)
            .toList();

    assertEquals(2, foundAuthors.size());
}
@Test
public void testSearchImplementationWithSingleEntry() {

View File

@@ -100,6 +100,17 @@ public class PrecursorEntityTest extends BuildDocumentIntegrationTest {
assertTrue(context.entity().removed());
}
/**
 * Verifies that a plain string search for an author name containing periods yields the expected matches.
 *
 * <p>Builds the document graph from the "TestPunctuation" minimal example, looks up
 * "Kuhn, J. O." via {@code EntityCreationService#byString} and expects a non-empty
 * result containing exactly two "CBI_author" entities.
 */
@Test
public void createFoundManualRedaction2() {

    final String authorName = "Kuhn, J. O.";
    Document punctuationDocument = buildGraph("files/Minimal Examples/TestPunctuation");
    EntityCreationService creationService = new EntityCreationService(entityEnrichmentService);

    List<TextEntity> foundAuthors = creationService.byString(authorName, "CBI_author", EntityType.ENTITY, punctuationDocument)
            .toList();

    // Checked first so that an empty result is reported as such before the exact count is compared.
    assertFalse(foundAuthors.isEmpty());
    assertEquals(2, foundAuthors.size());
}
private DocumentAndEntity createNotFoundManualRedaction() {

View File

@@ -856,7 +856,7 @@ rule "PII.9.3: Redact between \"AUTHOR(S)\" and \"(STUDY) COMPLETION DATE\""
when
$document: Document(containsStringIgnoreCase("AUTHOR(S)"), containsAnyStringIgnoreCase("COMPLETION DATE", "STUDY COMPLETION DATE"))
then
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document)
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document, 200)
.forEach(authorEntity -> authorEntity.redact("PII.9.3", "AUTHOR(S) was found", "personal_data_geolocation"));
end

View File

@@ -1416,7 +1416,7 @@ rule "PII.9.3: Redact between \"AUTHOR(S)\" and \"(STUDY) COMPLETION DATE\""
when
$document: Document(containsStringIgnoreCase("AUTHOR(S)"), containsAnyStringIgnoreCase("COMPLETION DATE", "STUDY COMPLETION DATE"))
then
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document)
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document, 200)
.forEach(authorEntity -> authorEntity.redact("PII.9.3", "AUTHOR(S) was found", "personal_data_geolocation"));
end

View File

@@ -837,7 +837,7 @@ rule "PII.9.3: Redact between \"AUTHOR(S)\" and \"(STUDY) COMPLETION DATE\""
when
$document: Document(containsStringIgnoreCase("AUTHOR(S)"), containsAnyStringIgnoreCase("COMPLETION DATE", "STUDY COMPLETION DATE"))
then
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document)
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document, 200)
.forEach(authorEntity -> authorEntity.redact("PII.9.3", "AUTHOR(S) was found", "personal_data_geolocation"));
end

View File

@@ -1430,7 +1430,7 @@ rule "PII.9.3: Redact between \"AUTHOR(S)\" and \"(STUDY) COMPLETION DATE\""
when
$document: Document(containsStringIgnoreCase("AUTHOR(S)"), containsAnyStringIgnoreCase("COMPLETION DATE", "STUDY COMPLETION DATE"))
then
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document)
entityCreationService.shortestBetweenAnyStringIgnoreCase(List.of("AUTHOR(S)", "AUTHOR(S):"), List.of("COMPLETION DATE", "COMPLETION DATE:", "STUDY COMPLETION DATE", "STUDY COMPLETION DATE:"), "PII", EntityType.ENTITY, $document, 200)
.forEach(authorEntity -> authorEntity.redact("PII.9.3", "AUTHOR(S) was found", "personal_data_geolocation"));
end