RED-207: Match case-insensitive dictionaries case-insensitively

This commit is contained in:
deiflaender 2020-07-27 13:19:47 +02:00
parent d282680cc8
commit f0e48087ff
7 changed files with 129 additions and 53 deletions

View File

@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence {
return text.charAt(0);
}
/**
 * Returns the first character of the unicode text found at the given position,
 * optionally lower-cased.
 *
 * @param index           position passed on to {@code textPositionAt}
 * @param caseInSensitive when {@code true}, the unicode text is lower-cased before
 *                        its first character is taken
 * @return the first character of the (possibly lower-cased) unicode text
 */
public char charAt(int index, boolean caseInSensitive) {
    String unicode = textPositionAt(index).getUnicode();
    if (!caseInSensitive) {
        return unicode.charAt(0);
    }
    // Lower-case the whole string first, then take its leading character.
    String lowered = unicode.toLowerCase();
    return lowered.charAt(0);
}
@Override
public TextPositionSequence subSequence(int start, int end) {
return new TextPositionSequence(textPositions.subList(start, end), page);

View File

@ -13,45 +13,65 @@ public class SearchableText {
private List<TextPositionSequence> sequences = new ArrayList<>();
/**
 * Appends a single text position sequence to the end of {@code sequences}.
 *
 * @param textPositionSequence the sequence to append
 */
public void add(TextPositionSequence textPositionSequence) {
sequences.add(textPositionSequence);
}
/**
 * Appends all given text position sequences, in list order, to the end of {@code sequences}.
 *
 * @param textPositionSequences the sequences to append
 */
public void addAll(List<TextPositionSequence> textPositionSequences) {
sequences.addAll(textPositionSequences);
}
public List<EntityPositionSequence> getSequences(String searchString) {
public List<EntityPositionSequence> getSequences(String searchString, boolean caseInSensitive) {
char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray();
String normalizedSearchString;
if (caseInSensitive) {
normalizedSearchString = searchString.toLowerCase();
} else {
normalizedSearchString = searchString;
}
char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray();
int counter = 0;
List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
List<EntityPositionSequence> finalMatches = new ArrayList<>();
for (int i = 0; i < sequences.size(); i++) {
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
for (int j = 0; j < sequences.get(i).length(); j++) {
if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' '
|| j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){
if(j == sequences.get(i).length() -1 && counter != 0 && !partMatch.getTextPositions().isEmpty()){
if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInSensitive) == ' ' && sequences.get(i - 1)
.charAt(sequences.get(i - 1).length() - 1, caseInSensitive) == ' ' || j > 0 && sequences.get(i)
.charAt(j, caseInSensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInSensitive) == ' ') {
if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
crossSequenceParts.add(partMatch);
}
continue;
}
if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){
if (j == 0 && sequences.get(i).charAt(j, caseInSensitive) != ' ' && i != 0 && sequences.get(i - 1)
.charAt(sequences.get(i - 1)
.length() - 1, caseInSensitive) != ' ' && searchChars[counter] == ' ') {
counter++;
}
if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') {
if (sequences.get(i)
.charAt(j, caseInSensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
.charAt(j, caseInSensitive) == '-') {
if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1))
|| j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') {
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
.charAt(j - 1, caseInSensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
.charAt(sequences.get(i - 1)
.length() - 1, caseInSensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
.charAt(sequences.get(i - 1).length() - 1, caseInSensitive) != ' ' && sequences.get(i)
.charAt(j, caseInSensitive) != ' ') {
partMatch.add(sequences.get(i).textPositionAt(j));
if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) {
if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
.charAt(j, caseInSensitive) == '-' && searchChars[counter] != '-')) {
counter++;
}
}
@ -59,10 +79,13 @@ public class SearchableText {
if (counter == searchString.length()) {
crossSequenceParts.add(partMatch);
if(i == sequences.size() - 1 && j == sequences.get(i).length() -1
|| j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1))
|| j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0))
|| j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') {
if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
.length() - 1 && isSeparator(sequences.get(i)
.charAt(j + 1, caseInSensitive)) || j == sequences.get(i)
.length() - 1 && isSeparator(sequences.get(i + 1)
.charAt(0, caseInSensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
.charAt(j, caseInSensitive) != ' ' && sequences.get(i + 1)
.charAt(0, caseInSensitive) != ' ') {
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
}
@ -72,14 +95,14 @@ public class SearchableText {
}
} else {
counter = 0;
if(!crossSequenceParts.isEmpty()){
if (!crossSequenceParts.isEmpty()) {
j--;
}
crossSequenceParts = new ArrayList<>();
partMatch = new TextPositionSequence(sequences.get(i).getPage());
}
if(j == sequences.get(i).length() -1 && counter != 0){
if (j == sequences.get(i).length() - 1 && counter != 0) {
crossSequenceParts.add(partMatch);
}
}
@ -89,18 +112,18 @@ public class SearchableText {
}
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts){
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
UUID id = UUID.randomUUID();
List<EntityPositionSequence> result = new ArrayList<>();
int currentPage = -1;
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
for (TextPositionSequence textPositionSequence :crossSequenceParts){
if(currentPage == -1){
for (TextPositionSequence textPositionSequence : crossSequenceParts) {
if (currentPage == -1) {
currentPage = textPositionSequence.getPage();
entityPositionSequence.setPageNumber(currentPage);
entityPositionSequence.getSequences().add(textPositionSequence);
} else if(currentPage == textPositionSequence.getPage()){
} else if (currentPage == textPositionSequence.getPage()) {
entityPositionSequence.getSequences().add(textPositionSequence);
} else {
result.add(entityPositionSequence);
@ -114,13 +137,14 @@ public class SearchableText {
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;
@ -137,10 +161,14 @@ public class SearchableText {
previous = word;
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " ");
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
.replaceAll("\n", " ")
.replaceAll(" ", " ");
}
public String getAsStringWithLinebreaks(){
public String getAsStringWithLinebreaks() {
StringBuilder sb = new StringBuilder();
TextPositionSequence previous = null;

View File

@ -60,10 +60,6 @@ public class DictionaryService {
entryColors = typeResponse.getTypes()
.stream()
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
dictionary = entryColors.keySet()
.stream()
.collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s)
.getEntries())));
hintTypes = typeResponse.getTypes()
.stream()
.filter(TypeResult::isHint)
@ -74,6 +70,7 @@ public class DictionaryService {
.filter(TypeResult::isCaseInsensitive)
.map(TypeResult::getType)
.collect(Collectors.toList());
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
}
} catch (FeignException e) {
log.warn("Got some unknown feignException", e);
@ -81,4 +78,16 @@ public class DictionaryService {
}
}
/**
 * Fetches the dictionary entries of one type from the dictionary client and
 * returns them as a set.
 *
 * <p>Entries of types listed in {@code caseInsensitiveTypes} are lower-cased;
 * entries of all other types are returned as delivered.
 *
 * @param s the dictionary type whose entries are requested
 * @return the entries of that type, lower-cased if the type is case-insensitive
 */
private Set<String> convertEntries(String s) {
    boolean lowerCaseEntries = caseInsensitiveTypes.contains(s);
    if (!lowerCaseEntries) {
        return new HashSet<>(dictionaryClient.getDictionaryForType(s).getEntries());
    }
    return dictionaryClient.getDictionaryForType(s).getEntries().stream()
            .map(entry -> entry.toLowerCase())
            .collect(Collectors.toSet());
}
}

View File

@ -65,7 +65,11 @@ public class EntityRedactionService {
.build());
for (Entity entity : analysedSection.getEntities()) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord()));
if(dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
} else{
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedSection.getEntities());
@ -82,7 +86,11 @@ public class EntityRedactionService {
.build());
for (Entity entity : analysedRowSection.getEntities()) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord()));
if(dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
} else{
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedRowSection.getEntities());
}
@ -99,23 +107,16 @@ public class EntityRedactionService {
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
String normalizedInputString = searchableText.toString();
String inputString = searchableText.toString();
String lowercaseInputString = inputString.toLowerCase();
Set<Entity> found = new HashSet<>();
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
for (String value : entry.getValue()) {
int startIndex;
int stopIndex = 0;
do {
startIndex = normalizedInputString.indexOf(value, stopIndex);
stopIndex = startIndex + value.length();
if (startIndex > -1 &&
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
}
} while (startIndex > -1);
if(dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())){
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
} else {
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
}
}
@ -124,6 +125,28 @@ public class EntityRedactionService {
return found;
}
/**
 * Searches the input for every occurrence of each dictionary value that stands on
 * its own, i.e. is delimited on both sides by the start/end of the input or by a
 * separator character.
 *
 * <p>After each occurrence (accepted or not) the search continues behind that
 * occurrence, so overlapping occurrences of the same value are not reported.
 *
 * @param inputString the text to search in
 * @param values      the dictionary values to look for; {@code null} and empty values are skipped
 * @param type        the dictionary type assigned to every entity that is found
 * @param headline    the headline assigned to every entity that is found
 * @return one entity per delimited occurrence, carrying the matched substring and its
 *         start (inclusive) and stop (exclusive) index within {@code inputString}
 */
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
    Set<Entity> found = new HashSet<>();
    for (String value : values) {
        // indexOf("", from) returns 'from' itself, so an empty value would never advance
        // the search position and the loop below would spin forever.
        if (value == null || value.isEmpty()) {
            continue;
        }
        int startIndex = inputString.indexOf(value);
        while (startIndex > -1) {
            int stopIndex = startIndex + value.length();
            // isSeparator already treats whitespace as a separator, so no extra whitespace check is needed.
            boolean startsOnBoundary = startIndex == 0 || isSeparator(inputString.charAt(startIndex - 1));
            boolean endsOnBoundary = stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex));
            if (startsOnBoundary && endsOnBoundary) {
                found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
            }
            // Resume behind the current occurrence.
            startIndex = inputString.indexOf(value, stopIndex);
        }
    }
    return found;
}
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}

View File

@ -71,6 +71,7 @@ public class RedactionIntegrationTest {
private final Map<String, List<String>> dictionary = new HashMap<>();
private final Map<String, float[]> typeColorMap = new HashMap<>();
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
@TestConfiguration
public static class RedactionIntegrationTestConfiguration {
@ -82,7 +83,8 @@ public class RedactionIntegrationTest {
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
.newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule();
@ -156,6 +158,12 @@ public class RedactionIntegrationTest {
hintTypeMap.put(NAME_CODE, false);
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
hintTypeMap.put(DEFAULT, true);
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
caseInSensitiveMap.put(ADDRESS_CODE, false);
caseInSensitiveMap.put(NAME_CODE, false);
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
caseInSensitiveMap.put(DEFAULT, true);
}
@ -166,14 +174,22 @@ public class RedactionIntegrationTest {
.map(typeColor -> TypeResult.builder()
.type(typeColor.getKey())
.color(typeColor.getValue())
.isHint(hintTypeMap.get(typeColor.getKey())).build())
.isHint(hintTypeMap.get(typeColor.getKey()))
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
.build())
.collect(Collectors.toList());
}
private DictionaryResponse getDictionaryResponse(String type) {
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build();
return DictionaryResponse.builder()
.color(typeColorMap.get(type))
.entries(dictionary.get(type))
.isHint(hintTypeMap.get(type))
.isCaseInsensitive(caseInSensitiveMap.get(type))
.build();
}

View File

@ -100,15 +100,11 @@ Pseudacris triseriata
poecilia reticulata
poultry
quail
rabbit
rabbits
rainbow trout
Rana limnocharis
rana
limnocharis
rana pipiens
rat
rats
reptile
reptiles
ricefish