Pull request #115: Expand CBI Authors with firstname initials

Merge in RED/redaction-service from ExpandAuthors to master

* commit '35f3582d0850053c1dc38ef0b149daa47bd9cc56':
  Expand CBI Authors with firstname initials
This commit is contained in:
Dominique Eiflaender 2021-02-02 12:35:07 +01:00
commit 7898f6a30f
5 changed files with 76 additions and 7 deletions

View File

@ -62,9 +62,10 @@ public class Section {
}
public boolean hasTableHeader(String headerName){
String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
return tabularData != null && tabularData.containsKey(cleanHeaderName);
/**
 * Checks whether the section's tabular data contains an entry for the given header.
 * <p>
 * The header name is normalized before the lookup: newlines, spaces and hyphens are
 * stripped, and the result is used as the key.
 *
 * @param headerName header text to look for; must not be null
 * @return {@code true} if {@code tabularData} is non-null and contains the normalized header as a key
 */
public boolean hasTableHeader(String headerName) {
// Strip newlines, spaces and hyphens so the lookup uses the normalized key form.
String cleanHeaderName = headerName.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
return tabularData != null && tabularData.containsKey(cleanHeaderName);
}
@ -80,6 +81,34 @@ public class Section {
}
/**
 * Tries to widen entities of one type using the text that follows them.
 * <p>
 * For every entity of the given type that has trailing text, the pattern is applied to that
 * trailing text. Each non-blank capture is appended to the entity's word and the combined
 * string is searched for via {@code findEntities}, carrying over the source entity's redaction
 * flag, matched rule, redaction reason and legal basis. All hits are then handed to
 * {@code EntitySearchUtils.addEntitiesWithHigherRank}, followed by
 * {@code EntitySearchUtils.removeEntitiesContainedInLarger} on the entity set.
 *
 * @param type                   entity type to expand; entities of other types are ignored
 * @param pattern                regular expression applied to the text after each entity
 * @param patternCaseInsensitive passed through to {@code Patterns.getCompiledPattern}
 * @param group                  index of the capture group whose text is appended to the entity word
 */
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group) {

    Pattern regex = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
    Set<Entity> widened = new HashSet<>();

    for (Entity candidate : entities) {
        boolean eligible = candidate.getType().equals(type) && candidate.getTextAfter() != null;
        if (!eligible) {
            continue;
        }

        Matcher hits = regex.matcher(candidate.getTextAfter());
        while (hits.find()) {
            String suffix = hits.group(group);
            if (StringUtils.isBlank(suffix)) {
                continue;
            }
            // Search for "word + captured suffix" so the longer span becomes its own entity.
            String widenedWord = candidate.getWord() + suffix;
            widened.addAll(findEntities(widenedWord, type, false, candidate.isRedaction(), candidate.getMatchedRule(),
                    candidate.getRedactionReason(), candidate.getLegalBasis()));
        }
    }

    EntitySearchUtils.addEntitiesWithHigherRank(entities, widened, dictionary);
    EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
public void redact(String type, int ruleNumber, String reason, String legalBasis) {
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
@ -160,7 +189,7 @@ public class Section {
String trimmedValue = value.trim();
String cleanValue;
if(trimmedValue.startsWith(":")){
if (trimmedValue.startsWith(":")) {
cleanValue = trimmedValue.substring(1).trim();
} else {
cleanValue = trimmedValue;
@ -207,8 +236,8 @@ public class Section {
}
public void redactAndRecommendByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType, int ruleNumber,
String reason, String legalBasis) {
public void redactAndRecommendByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType,
int ruleNumber, String reason, String legalBasis) {
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
@ -363,6 +392,7 @@ public class Section {
}
}
}
}

View File

@ -441,7 +441,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_08_Volume_3CA_B-6_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)

View File

@ -1,15 +1,36 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import org.apache.commons.lang3.StringUtils;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
public class RegExPatternTest {
/**
 * Verifies that the author-initials pattern captures exactly the initials trailing an author
 * name and does not run on into the following surname.
 */
@Test
public void testExpand() {

    String pattern = "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)";
    String text = ", G.R., Simoneaux,";

    Pattern compiledPattern = Pattern.compile(pattern, 0);
    Matcher matcher = compiledPattern.matcher(text);

    List<String> matches = new ArrayList<>();
    while (matcher.find()) {
        String match = matcher.group(1);
        if (StringUtils.isNotBlank(match)) {
            matches.add(match);
        }
    }

    // ", S" of ", Simoneaux" must not match: there is no word boundary between 'S' and 'i'.
    // A plain AssertionError is reported by JUnit as a test failure and needs no extra import.
    if (matches.size() != 1 || !", G.R.".equals(matches.get(0))) {
        throw new AssertionError("Expected exactly one match \", G.R.\" but got " + matches);
    }
}
@Test
public void testEmailRegEx(){
String text = "Address: Schwarzwaldalle " +

View File

@ -7,6 +7,15 @@ global Section section
// --------------------------------------- CBI rules -------------------------------------------------------------------
// Applies when the section matches type CBI_author or recommendation_CBI_author.
// Both types are expanded with the same case-sensitive pattern, using capture group 1:
// an optional comma, a space, then one to three uppercase initials, each optionally
// followed by a period and optionally separated by a single space.
rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
then
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
end
rule "1: Redact CBI Authors"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))

View File

@ -7,6 +7,15 @@ global Section section
// --------------------------------------- CBI rules -------------------------------------------------------------------
// Applies when the section matches type CBI_author or recommendation_CBI_author.
// Both types are expanded with the same case-sensitive pattern, using capture group 1:
// an optional comma, a space, then one to three uppercase initials, each optionally
// followed by a period and optionally separated by a single space.
rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
then
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
end
rule "1: Redacted because Section contains Vertebrate"
when
Section(matchesType("vertebrate"))