Pull request #14: RED-207: Match caseInsensitive dictionaries caseInSensitive

Merge in RED/redaction-service from RED-207 to master

* commit '135a715e22e6c2536b268db29161552cfd7a6c1c':
  Fixed style in EntityRedactionService
  Fixed wrong naming of caseInsensitive
  RED-207: Match caseInsensitive dictionaries caseInSensitive
This commit is contained in:
Dominique Eiflaender 2020-07-27 13:52:40 +02:00
commit b7ee62f44d
7 changed files with 142 additions and 61 deletions

View File

@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence {
return text.charAt(0); return text.charAt(0);
} }
/**
 * Returns the first character of the text position at {@code index}, optionally lower-cased.
 *
 * @param index position within this sequence, resolved through {@code textPositionAt}
 * @param caseInSensitive when {@code true}, the unicode text is lower-cased with
 *        {@link String#toLowerCase()} (default locale) before its first character is taken
 * @return the first character of the (possibly lower-cased) unicode text
 */
public char charAt(int index, boolean caseInSensitive) {
    String unicode = textPositionAt(index).getUnicode();
    if (caseInSensitive) {
        return unicode.toLowerCase().charAt(0);
    }
    return unicode.charAt(0);
}
@Override @Override
public TextPositionSequence subSequence(int start, int end) { public TextPositionSequence subSequence(int start, int end) {
return new TextPositionSequence(textPositions.subList(start, end), page); return new TextPositionSequence(textPositions.subList(start, end), page);

View File

@ -13,45 +13,65 @@ public class SearchableText {
private List<TextPositionSequence> sequences = new ArrayList<>(); private List<TextPositionSequence> sequences = new ArrayList<>();
public void add(TextPositionSequence textPositionSequence) { public void add(TextPositionSequence textPositionSequence) {
sequences.add(textPositionSequence); sequences.add(textPositionSequence);
} }
public void addAll(List<TextPositionSequence> textPositionSequences) { public void addAll(List<TextPositionSequence> textPositionSequences) {
sequences.addAll(textPositionSequences); sequences.addAll(textPositionSequences);
} }
public List<EntityPositionSequence> getSequences(String searchString) { public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive) {
char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray(); String normalizedSearchString;
if (caseInsensitive) {
normalizedSearchString = searchString.toLowerCase();
} else {
normalizedSearchString = searchString;
}
char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray();
int counter = 0; int counter = 0;
List<TextPositionSequence> crossSequenceParts = new ArrayList<>(); List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
List<EntityPositionSequence> finalMatches = new ArrayList<>(); List<EntityPositionSequence> finalMatches = new ArrayList<>();
for (int i = 0; i < sequences.size(); i++) { for (int i = 0; i < sequences.size(); i++) {
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage()); TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
for (int j = 0; j < sequences.get(i).length(); j++) { for (int j = 0; j < sequences.get(i).length(); j++) {
if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' ' if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1)
|| j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){ .charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i)
if(j == sequences.get(i).length() -1 && counter != 0 && !partMatch.getTextPositions().isEmpty()){ .charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') {
if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
crossSequenceParts.add(partMatch); crossSequenceParts.add(partMatch);
} }
continue; continue;
} }
if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){ if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1)
.charAt(sequences.get(i - 1)
.length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') {
counter++; counter++;
} }
if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') { if (sequences.get(i)
.charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
.charAt(j, caseInsensitive) == '-') {
if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1)) if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
|| j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') { .charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
.charAt(sequences.get(i - 1)
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i)
.charAt(j, caseInsensitive) != ' ') {
partMatch.add(sequences.get(i).textPositionAt(j)); partMatch.add(sequences.get(i).textPositionAt(j));
if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) { if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
.charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) {
counter++; counter++;
} }
} }
@ -59,10 +79,13 @@ public class SearchableText {
if (counter == searchString.length()) { if (counter == searchString.length()) {
crossSequenceParts.add(partMatch); crossSequenceParts.add(partMatch);
if(i == sequences.size() - 1 && j == sequences.get(i).length() -1 if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
|| j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1)) .length() - 1 && isSeparator(sequences.get(i)
|| j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0)) .charAt(j + 1, caseInsensitive)) || j == sequences.get(i)
|| j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') { .length() - 1 && isSeparator(sequences.get(i + 1)
.charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
.charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1)
.charAt(0, caseInsensitive) != ' ') {
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts)); finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
} }
@ -72,14 +95,14 @@ public class SearchableText {
} }
} else { } else {
counter = 0; counter = 0;
if(!crossSequenceParts.isEmpty()){ if (!crossSequenceParts.isEmpty()) {
j--; j--;
} }
crossSequenceParts = new ArrayList<>(); crossSequenceParts = new ArrayList<>();
partMatch = new TextPositionSequence(sequences.get(i).getPage()); partMatch = new TextPositionSequence(sequences.get(i).getPage());
} }
if(j == sequences.get(i).length() -1 && counter != 0){ if (j == sequences.get(i).length() - 1 && counter != 0) {
crossSequenceParts.add(partMatch); crossSequenceParts.add(partMatch);
} }
} }
@ -89,18 +112,18 @@ public class SearchableText {
} }
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts){ private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
UUID id = UUID.randomUUID(); UUID id = UUID.randomUUID();
List<EntityPositionSequence> result = new ArrayList<>(); List<EntityPositionSequence> result = new ArrayList<>();
int currentPage = -1; int currentPage = -1;
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id); EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
for (TextPositionSequence textPositionSequence :crossSequenceParts){ for (TextPositionSequence textPositionSequence : crossSequenceParts) {
if(currentPage == -1){ if (currentPage == -1) {
currentPage = textPositionSequence.getPage(); currentPage = textPositionSequence.getPage();
entityPositionSequence.setPageNumber(currentPage); entityPositionSequence.setPageNumber(currentPage);
entityPositionSequence.getSequences().add(textPositionSequence); entityPositionSequence.getSequences().add(textPositionSequence);
} else if(currentPage == textPositionSequence.getPage()){ } else if (currentPage == textPositionSequence.getPage()) {
entityPositionSequence.getSequences().add(textPositionSequence); entityPositionSequence.getSequences().add(textPositionSequence);
} else { } else {
result.add(entityPositionSequence); result.add(entityPositionSequence);
@ -114,13 +137,14 @@ public class SearchableText {
private boolean isSeparator(char c) { private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == ''; return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
} }
@Override @Override
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null; TextPositionSequence previous = null;
@ -137,10 +161,14 @@ public class SearchableText {
previous = word; previous = word;
} }
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " "); return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
.replaceAll("\n", " ")
.replaceAll(" ", " ");
} }
public String getAsStringWithLinebreaks(){
public String getAsStringWithLinebreaks() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null; TextPositionSequence previous = null;

View File

@ -60,10 +60,6 @@ public class DictionaryService {
entryColors = typeResponse.getTypes() entryColors = typeResponse.getTypes()
.stream() .stream()
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor)); .collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
dictionary = entryColors.keySet()
.stream()
.collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s)
.getEntries())));
hintTypes = typeResponse.getTypes() hintTypes = typeResponse.getTypes()
.stream() .stream()
.filter(TypeResult::isHint) .filter(TypeResult::isHint)
@ -74,6 +70,7 @@ public class DictionaryService {
.filter(TypeResult::isCaseInsensitive) .filter(TypeResult::isCaseInsensitive)
.map(TypeResult::getType) .map(TypeResult::getType)
.collect(Collectors.toList()); .collect(Collectors.toList());
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
} }
} catch (FeignException e) { } catch (FeignException e) {
log.warn("Got some unknown feignException", e); log.warn("Got some unknown feignException", e);
@ -81,4 +78,16 @@ public class DictionaryService {
} }
} }
/**
 * Loads the dictionary entries of one type from the dictionary client as a set.
 *
 * <p>Types listed in {@code caseInsensitiveTypes} get every entry lower-cased;
 * all other types are copied unchanged.
 *
 * @param s the dictionary type whose entries are fetched
 * @return a new set holding the (possibly lower-cased) entries of that type
 */
private Set<String> convertEntries(String s) {
    if (!caseInsensitiveTypes.contains(s)) {
        return new HashSet<>(dictionaryClient.getDictionaryForType(s).getEntries());
    }

    Set<String> lowerCased = new HashSet<>();
    for (String entry : dictionaryClient.getDictionaryForType(s).getEntries()) {
        lowerCased.add(entry.toLowerCase());
    }
    return lowerCased;
}
} }

View File

@ -27,6 +27,7 @@ public class EntityRedactionService {
private final DictionaryService dictionaryService; private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService; private final DroolsExecutionService droolsExecutionService;
public void processDocument(Document classifiedDoc) { public void processDocument(Document classifiedDoc) {
dictionaryService.updateDictionary(); dictionaryService.updateDictionary();
@ -56,8 +57,7 @@ public class EntityRedactionService {
} }
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline()); Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
Section analysedSection = droolsExecutionService.executeRules(Section Section analysedSection = droolsExecutionService.executeRules(Section.builder()
.builder()
.entities(entities) .entities(entities)
.text(searchableText.getAsStringWithLinebreaks()) .text(searchableText.getAsStringWithLinebreaks())
.searchText(searchableText.toString()) .searchText(searchableText.toString())
@ -65,7 +65,11 @@ public class EntityRedactionService {
.build()); .build());
for (Entity entity : analysedSection.getEntities()) { for (Entity entity : analysedSection.getEntities()) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord())); if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
}
} }
documentEntities.addAll(analysedSection.getEntities()); documentEntities.addAll(analysedSection.getEntities());
@ -73,8 +77,7 @@ public class EntityRedactionService {
for (SearchableText searchableRow : searchableRows) { for (SearchableText searchableRow : searchableRows) {
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader"); Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
Section analysedRowSection = droolsExecutionService.executeRules(Section Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.builder()
.entities(rowEntities) .entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks()) .text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString()) .searchText(searchableRow.toString())
@ -82,7 +85,11 @@ public class EntityRedactionService {
.build()); .build());
for (Entity entity : analysedRowSection.getEntities()) { for (Entity entity : analysedRowSection.getEntities()) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord())); if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
}
} }
documentEntities.addAll(analysedRowSection.getEntities()); documentEntities.addAll(analysedRowSection.getEntities());
} }
@ -90,32 +97,27 @@ public class EntityRedactionService {
documentEntities.forEach(entity -> { documentEntities.forEach(entity -> {
entity.getPositionSequences().forEach(sequence -> { entity.getPositionSequences().forEach(sequence -> {
classifiedDoc.getEntities().computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>()).add( classifiedDoc.getEntities()
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence), entity.getHeadline(), entity.getMatchedRule()) .computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>())
); .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List
.of(sequence), entity.getHeadline(), entity.getMatchedRule()));
}); });
}); });
} }
private Set<Entity> findEntities(SearchableText searchableText, String headline) { private Set<Entity> findEntities(SearchableText searchableText, String headline) {
String normalizedInputString = searchableText.toString(); String inputString = searchableText.toString();
String lowercaseInputString = inputString.toLowerCase();
Set<Entity> found = new HashSet<>(); Set<Entity> found = new HashSet<>();
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) { for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
for (String value : entry.getValue()) {
int startIndex;
int stopIndex = 0;
do {
startIndex = normalizedInputString.indexOf(value, stopIndex);
stopIndex = startIndex + value.length();
if (startIndex > -1 && if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) && found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) { } else {
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline)); found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
}
} while (startIndex > -1);
} }
} }
@ -124,19 +126,45 @@ public class EntityRedactionService {
return found; return found;
} }
/**
 * Finds every occurrence of the given dictionary values in {@code inputString} that is
 * delimited on both sides by the start/end of the input or by a separator character.
 *
 * <p>After each occurrence the scan resumes at the end of that occurrence, so
 * overlapping occurrences of the same value are not reported.
 *
 * @param inputString the text to scan; matching is exact, so callers that want
 *        case-insensitive matching must pass already normalized text and values
 * @param values the dictionary entries to look for; empty entries are ignored
 * @param type the entity type assigned to every match
 * @param headline the headline passed on to every created entity
 * @return the entities created for all delimited occurrences, each carrying the matched
 *         substring and its start (inclusive) and stop (exclusive) offsets
 */
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
    Set<Entity> found = new HashSet<>();
    for (String value : values) {
        if (value.isEmpty()) {
            // indexOf("", n) returns n, so the scan below would never advance and
            // would loop forever on an empty dictionary entry.
            continue;
        }
        int startIndex = inputString.indexOf(value);
        while (startIndex > -1) {
            int stopIndex = startIndex + value.length();
            // isSeparator already treats whitespace as a separator.
            boolean startsAtBoundary = startIndex == 0 || isSeparator(inputString.charAt(startIndex - 1));
            if (startsAtBoundary
                    && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
                found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
            }
            startIndex = inputString.indexOf(value, stopIndex);
        }
    }
    return found;
}
private boolean isSeparator(char c) { private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == ''; return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
} }
public void removeEntitiesContainedInLarger(Set<Entity> entities) { public void removeEntitiesContainedInLarger(Set<Entity> entities) {
List<Entity> wordsToRemove = new ArrayList<>(); List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : entities) { for (Entity word : entities) {
for (Entity inner : entities) { for (Entity inner : entities) {
if (inner.getWord().length() < word.getWord().length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) { if (inner.getWord().length() < word.getWord()
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
wordsToRemove.add(inner); wordsToRemove.add(inner);
} }
} }
} }
entities.removeAll(wordsToRemove); entities.removeAll(wordsToRemove);
} }
} }

View File

@ -71,6 +71,7 @@ public class RedactionIntegrationTest {
private final Map<String, List<String>> dictionary = new HashMap<>(); private final Map<String, List<String>> dictionary = new HashMap<>();
private final Map<String, float[]> typeColorMap = new HashMap<>(); private final Map<String, float[]> typeColorMap = new HashMap<>();
private final Map<String, Boolean> hintTypeMap = new HashMap<>(); private final Map<String, Boolean> hintTypeMap = new HashMap<>();
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
@TestConfiguration @TestConfiguration
public static class RedactionIntegrationTestConfiguration { public static class RedactionIntegrationTestConfiguration {
@ -82,7 +83,8 @@ public class RedactionIntegrationTest {
KieFileSystem kieFileSystem = kieServices.newKieFileSystem(); KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8)); InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input)); kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
.newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem); KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll(); kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule(); KieModule kieModule = kieBuilder.getKieModule();
@ -156,6 +158,12 @@ public class RedactionIntegrationTest {
hintTypeMap.put(NAME_CODE, false); hintTypeMap.put(NAME_CODE, false);
hintTypeMap.put(NO_REDACTION_INDICATOR, true); hintTypeMap.put(NO_REDACTION_INDICATOR, true);
hintTypeMap.put(DEFAULT, true); hintTypeMap.put(DEFAULT, true);
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
caseInSensitiveMap.put(ADDRESS_CODE, false);
caseInSensitiveMap.put(NAME_CODE, false);
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
caseInSensitiveMap.put(DEFAULT, true);
} }
@ -166,14 +174,22 @@ public class RedactionIntegrationTest {
.map(typeColor -> TypeResult.builder() .map(typeColor -> TypeResult.builder()
.type(typeColor.getKey()) .type(typeColor.getKey())
.color(typeColor.getValue()) .color(typeColor.getValue())
.isHint(hintTypeMap.get(typeColor.getKey())).build()) .isHint(hintTypeMap.get(typeColor.getKey()))
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
.build())
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
private DictionaryResponse getDictionaryResponse(String type) { private DictionaryResponse getDictionaryResponse(String type) {
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build(); return DictionaryResponse.builder()
.color(typeColorMap.get(type))
.entries(dictionary.get(type))
.isHint(hintTypeMap.get(type))
.isCaseInsensitive(caseInSensitiveMap.get(type))
.build();
} }

View File

@ -100,15 +100,11 @@ Pseudacris triseriata
poecilia reticulata poecilia reticulata
poultry poultry
quail quail
rabbit
rabbits
rainbow trout rainbow trout
Rana limnocharis Rana limnocharis
rana rana
limnocharis limnocharis
rana pipiens rana pipiens
rat
rats
reptile reptile
reptiles reptiles
ricefish ricefish