Pull request #14: RED-207: Match caseInsensitive dictionaries case-insensitively

Merge in RED/redaction-service from RED-207 to master

* commit '135a715e22e6c2536b268db29161552cfd7a6c1c':
  Fixed style in EntityRedactionService
  Fixed wrong naming of caseInsensitive
  RED-207: Match caseInsensitive dictionaries case-insensitively
This commit is contained in:
Dominique Eiflaender 2020-07-27 13:52:40 +02:00
commit b7ee62f44d
7 changed files with 142 additions and 61 deletions

View File

@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence {
return text.charAt(0);
}
/**
 * Returns the first character of the unicode text of the text position at the given index,
 * optionally lower-cased.
 *
 * @param index           index of the text position to read
 * @param caseInSensitive when {@code true}, the unicode text is lower-cased with
 *                        {@link String#toLowerCase()} (JVM default locale) before its
 *                        first character is taken
 * @return the first character of the (optionally lower-cased) unicode text
 */
public char charAt(int index, boolean caseInSensitive) {
    String unicode = textPositionAt(index).getUnicode();
    if (caseInSensitive) {
        return unicode.toLowerCase().charAt(0);
    }
    return unicode.charAt(0);
}
@Override
public TextPositionSequence subSequence(int start, int end) {
return new TextPositionSequence(textPositions.subList(start, end), page);

View File

@ -13,45 +13,65 @@ public class SearchableText {
private List<TextPositionSequence> sequences = new ArrayList<>();
/**
 * Appends a single text position sequence to the sequences held by this object.
 *
 * @param textPositionSequence the sequence to append
 */
public void add(TextPositionSequence textPositionSequence) {
    this.sequences.add(textPositionSequence);
}
/**
 * Appends all given text position sequences, in list order, to the sequences held by this object.
 *
 * @param textPositionSequences the sequences to append
 */
public void addAll(List<TextPositionSequence> textPositionSequences) {
    this.sequences.addAll(textPositionSequences);
}
public List<EntityPositionSequence> getSequences(String searchString) {
public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive) {
char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray();
String normalizedSearchString;
if (caseInsensitive) {
normalizedSearchString = searchString.toLowerCase();
} else {
normalizedSearchString = searchString;
}
char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray();
int counter = 0;
List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
List<EntityPositionSequence> finalMatches = new ArrayList<>();
for (int i = 0; i < sequences.size(); i++) {
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
for (int j = 0; j < sequences.get(i).length(); j++) {
if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' '
|| j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){
if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1)
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i)
.charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') {
if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
crossSequenceParts.add(partMatch);
}
continue;
}
if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){
if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1)
.charAt(sequences.get(i - 1)
.length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') {
counter++;
}
if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') {
if (sequences.get(i)
.charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
.charAt(j, caseInsensitive) == '-') {
if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1))
|| j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') {
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
.charAt(sequences.get(i - 1)
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i)
.charAt(j, caseInsensitive) != ' ') {
partMatch.add(sequences.get(i).textPositionAt(j));
if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) {
if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
.charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) {
counter++;
}
}
@ -59,10 +79,13 @@ public class SearchableText {
if (counter == searchString.length()) {
crossSequenceParts.add(partMatch);
if(i == sequences.size() - 1 && j == sequences.get(i).length() -1
|| j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1))
|| j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0))
|| j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') {
if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
.length() - 1 && isSeparator(sequences.get(i)
.charAt(j + 1, caseInsensitive)) || j == sequences.get(i)
.length() - 1 && isSeparator(sequences.get(i + 1)
.charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
.charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1)
.charAt(0, caseInsensitive) != ' ') {
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
}
@ -114,13 +137,14 @@ public class SearchableText {
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;
@ -137,10 +161,14 @@ public class SearchableText {
previous = word;
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " ");
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
.replaceAll("\n", " ")
.replaceAll(" ", " ");
}
public String getAsStringWithLinebreaks() {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;

View File

@ -60,10 +60,6 @@ public class DictionaryService {
entryColors = typeResponse.getTypes()
.stream()
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
dictionary = entryColors.keySet()
.stream()
.collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s)
.getEntries())));
hintTypes = typeResponse.getTypes()
.stream()
.filter(TypeResult::isHint)
@ -74,6 +70,7 @@ public class DictionaryService {
.filter(TypeResult::isCaseInsensitive)
.map(TypeResult::getType)
.collect(Collectors.toList());
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
}
} catch (FeignException e) {
log.warn("Got some unknown feignException", e);
@ -81,4 +78,16 @@ public class DictionaryService {
}
}
/**
 * Fetches the dictionary entries of the given type from the dictionary client and returns
 * them as a set. Entries of a type contained in {@code caseInsensitiveTypes} are lower-cased
 * with {@link String#toLowerCase()}; entries of all other types are kept unchanged.
 *
 * @param type the dictionary type whose entries are requested
 * @return the entries of the type, lower-cased if the type is case-insensitive
 */
private Set<String> convertEntries(String type) {
    boolean lowerCaseEntries = caseInsensitiveTypes.contains(type);
    return dictionaryClient.getDictionaryForType(type)
            .getEntries()
            .stream()
            .map(entry -> lowerCaseEntries ? entry.toLowerCase() : entry)
            .collect(Collectors.toCollection(HashSet::new));
}
}

View File

@ -27,6 +27,7 @@ public class EntityRedactionService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
public void processDocument(Document classifiedDoc) {
dictionaryService.updateDictionary();
@ -56,8 +57,7 @@ public class EntityRedactionService {
}
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
Section analysedSection = droolsExecutionService.executeRules(Section
.builder()
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
.entities(entities)
.text(searchableText.getAsStringWithLinebreaks())
.searchText(searchableText.toString())
@ -65,7 +65,11 @@ public class EntityRedactionService {
.build());
for (Entity entity : analysedSection.getEntities()) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord()));
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedSection.getEntities());
@ -73,8 +77,7 @@ public class EntityRedactionService {
for (SearchableText searchableRow : searchableRows) {
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
Section analysedRowSection = droolsExecutionService.executeRules(Section
.builder()
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
@ -82,7 +85,11 @@ public class EntityRedactionService {
.build());
for (Entity entity : analysedRowSection.getEntities()) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord()));
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedRowSection.getEntities());
}
@ -90,32 +97,27 @@ public class EntityRedactionService {
documentEntities.forEach(entity -> {
entity.getPositionSequences().forEach(sequence -> {
classifiedDoc.getEntities().computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>()).add(
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence), entity.getHeadline(), entity.getMatchedRule())
);
classifiedDoc.getEntities()
.computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List
.of(sequence), entity.getHeadline(), entity.getMatchedRule()));
});
});
}
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
String normalizedInputString = searchableText.toString();
String inputString = searchableText.toString();
String lowercaseInputString = inputString.toLowerCase();
Set<Entity> found = new HashSet<>();
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
for (String value : entry.getValue()) {
int startIndex;
int stopIndex = 0;
do {
startIndex = normalizedInputString.indexOf(value, stopIndex);
stopIndex = startIndex + value.length();
if (startIndex > -1 &&
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
}
} while (startIndex > -1);
if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
} else {
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
}
}
@ -124,19 +126,45 @@ public class EntityRedactionService {
return found;
}
/**
 * Searches the input string for stand-alone occurrences of the given values.
 * <p>
 * An occurrence counts only if it starts at the beginning of the input or directly after a
 * separator, and ends at the end of the input or directly before a separator (as decided by
 * {@code isSeparator}). The search for a value resumes behind the previous occurrence, so
 * overlapping occurrences of the same value are not reported.
 * <p>
 * {@code null} and empty values are skipped: an empty value would match at every index
 * without ever advancing the search position, which previously caused an endless loop.
 *
 * @param inputString the text to search in
 * @param values      the values to look for
 * @param type        the type assigned to every created entity
 * @param headline    the headline assigned to every created entity
 * @return the entities created for all stand-alone occurrences; empty if there are none
 */
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
    Set<Entity> found = new HashSet<>();
    for (String value : values) {
        if (value == null || value.isEmpty()) {
            // indexOf("", n) returns n, so the search position would never move forward.
            continue;
        }
        int startIndex = inputString.indexOf(value);
        while (startIndex > -1) {
            int stopIndex = startIndex + value.length();
            // isSeparator already treats whitespace as a separator.
            boolean startsOnBoundary = startIndex == 0 || isSeparator(inputString.charAt(startIndex - 1));
            boolean endsOnBoundary = stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex));
            if (startsOnBoundary && endsOnBoundary) {
                found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
            }
            startIndex = inputString.indexOf(value, stopIndex);
        }
    }
    return found;
}
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
/**
 * Removes, in place, every entity that is contained in a larger one.
 * <p>
 * An entity is removed when another entity in the set has a strictly longer word and the
 * removed entity's start/end range lies within that other entity's start/end range.
 *
 * @param entities the set to clean up; it is modified directly
 */
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : entities) {
for (Entity inner : entities) {
// NOTE(review): the same condition appears twice below (single-line and wrapped form);
// this looks like both sides of a diff hunk - confirm which one is current.
if (inner.getWord().length() < word.getWord().length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
if (inner.getWord().length() < word.getWord()
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
wordsToRemove.add(inner);
}
}
}
// Removal is deferred until after the loops so the set is not modified while it is iterated.
entities.removeAll(wordsToRemove);
}
}

View File

@ -71,6 +71,7 @@ public class RedactionIntegrationTest {
private final Map<String, List<String>> dictionary = new HashMap<>();
private final Map<String, float[]> typeColorMap = new HashMap<>();
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
@TestConfiguration
public static class RedactionIntegrationTestConfiguration {
@ -82,7 +83,8 @@ public class RedactionIntegrationTest {
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
.newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule();
@ -156,6 +158,12 @@ public class RedactionIntegrationTest {
hintTypeMap.put(NAME_CODE, false);
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
hintTypeMap.put(DEFAULT, true);
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
caseInSensitiveMap.put(ADDRESS_CODE, false);
caseInSensitiveMap.put(NAME_CODE, false);
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
caseInSensitiveMap.put(DEFAULT, true);
}
@ -166,14 +174,22 @@ public class RedactionIntegrationTest {
.map(typeColor -> TypeResult.builder()
.type(typeColor.getKey())
.color(typeColor.getValue())
.isHint(hintTypeMap.get(typeColor.getKey())).build())
.isHint(hintTypeMap.get(typeColor.getKey()))
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
.build())
.collect(Collectors.toList());
}
/**
 * Builds the {@code DictionaryResponse} for the given type from the fixture maps of this test:
 * color from {@code typeColorMap}, entries from {@code dictionary}, the hint flag from
 * {@code hintTypeMap} and the case-insensitivity flag from {@code caseInSensitiveMap}.
 *
 * @param type the dictionary type to build the response for
 * @return the response assembled from the fixture maps
 */
private DictionaryResponse getDictionaryResponse(String type) {
// NOTE(review): two return statements follow (the first without isCaseInsensitive);
// this looks like both sides of a diff hunk - confirm which one is current.
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build();
return DictionaryResponse.builder()
.color(typeColorMap.get(type))
.entries(dictionary.get(type))
.isHint(hintTypeMap.get(type))
.isCaseInsensitive(caseInSensitiveMap.get(type))
.build();
}

View File

@ -100,15 +100,11 @@ Pseudacris triseriata
poecilia reticulata
poultry
quail
rabbit
rabbits
rainbow trout
Rana limnocharis
rana
limnocharis
rana pipiens
rat
rats
reptile
reptiles
ricefish