RED-207: Match case-insensitive dictionaries case-insensitively
This commit is contained in:
parent
d282680cc8
commit
f0e48087ff
@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence {
|
||||
return text.charAt(0);
|
||||
}
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||
|
||||
@ -13,45 +13,65 @@ public class SearchableText {
|
||||
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
|
||||
public void add(TextPositionSequence textPositionSequence) {
|
||||
|
||||
sequences.add(textPositionSequence);
|
||||
}
|
||||
|
||||
|
||||
public void addAll(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
sequences.addAll(textPositionSequences);
|
||||
}
|
||||
|
||||
|
||||
public List<EntityPositionSequence> getSequences(String searchString) {
|
||||
public List<EntityPositionSequence> getSequences(String searchString, boolean caseInSensitive) {
|
||||
|
||||
char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray();
|
||||
String normalizedSearchString;
|
||||
if (caseInSensitive) {
|
||||
normalizedSearchString = searchString.toLowerCase();
|
||||
} else {
|
||||
normalizedSearchString = searchString;
|
||||
}
|
||||
|
||||
char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray();
|
||||
int counter = 0;
|
||||
|
||||
|
||||
List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
|
||||
List<EntityPositionSequence> finalMatches = new ArrayList<>();
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
||||
for (int j = 0; j < sequences.get(i).length(); j++) {
|
||||
|
||||
if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' '
|
||||
|| j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){
|
||||
if(j == sequences.get(i).length() -1 && counter != 0 && !partMatch.getTextPositions().isEmpty()){
|
||||
if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInSensitive) == ' ' && sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1).length() - 1, caseInSensitive) == ' ' || j > 0 && sequences.get(i)
|
||||
.charAt(j, caseInSensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInSensitive) == ' ') {
|
||||
if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
|
||||
crossSequenceParts.add(partMatch);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){
|
||||
if (j == 0 && sequences.get(i).charAt(j, caseInSensitive) != ' ' && i != 0 && sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1)
|
||||
.length() - 1, caseInSensitive) != ' ' && searchChars[counter] == ' ') {
|
||||
counter++;
|
||||
}
|
||||
|
||||
if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') {
|
||||
if (sequences.get(i)
|
||||
.charAt(j, caseInSensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
|
||||
.charAt(j, caseInSensitive) == '-') {
|
||||
|
||||
if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1))
|
||||
|| j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') {
|
||||
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
|
||||
.charAt(j - 1, caseInSensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1)
|
||||
.length() - 1, caseInSensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1).length() - 1, caseInSensitive) != ' ' && sequences.get(i)
|
||||
.charAt(j, caseInSensitive) != ' ') {
|
||||
partMatch.add(sequences.get(i).textPositionAt(j));
|
||||
if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) {
|
||||
if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
|
||||
.charAt(j, caseInSensitive) == '-' && searchChars[counter] != '-')) {
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
@ -59,10 +79,13 @@ public class SearchableText {
|
||||
if (counter == searchString.length()) {
|
||||
crossSequenceParts.add(partMatch);
|
||||
|
||||
if(i == sequences.size() - 1 && j == sequences.get(i).length() -1
|
||||
|| j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1))
|
||||
|| j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0))
|
||||
|| j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') {
|
||||
if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
|
||||
.length() - 1 && isSeparator(sequences.get(i)
|
||||
.charAt(j + 1, caseInSensitive)) || j == sequences.get(i)
|
||||
.length() - 1 && isSeparator(sequences.get(i + 1)
|
||||
.charAt(0, caseInSensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
|
||||
.charAt(j, caseInSensitive) != ' ' && sequences.get(i + 1)
|
||||
.charAt(0, caseInSensitive) != ' ') {
|
||||
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
|
||||
}
|
||||
|
||||
@ -72,14 +95,14 @@ public class SearchableText {
|
||||
}
|
||||
} else {
|
||||
counter = 0;
|
||||
if(!crossSequenceParts.isEmpty()){
|
||||
if (!crossSequenceParts.isEmpty()) {
|
||||
j--;
|
||||
}
|
||||
crossSequenceParts = new ArrayList<>();
|
||||
partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
||||
}
|
||||
|
||||
if(j == sequences.get(i).length() -1 && counter != 0){
|
||||
if (j == sequences.get(i).length() - 1 && counter != 0) {
|
||||
crossSequenceParts.add(partMatch);
|
||||
}
|
||||
}
|
||||
@ -89,18 +112,18 @@ public class SearchableText {
|
||||
}
|
||||
|
||||
|
||||
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts){
|
||||
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
|
||||
|
||||
UUID id = UUID.randomUUID();
|
||||
List<EntityPositionSequence> result = new ArrayList<>();
|
||||
int currentPage = -1;
|
||||
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
|
||||
for (TextPositionSequence textPositionSequence :crossSequenceParts){
|
||||
if(currentPage == -1){
|
||||
for (TextPositionSequence textPositionSequence : crossSequenceParts) {
|
||||
if (currentPage == -1) {
|
||||
currentPage = textPositionSequence.getPage();
|
||||
entityPositionSequence.setPageNumber(currentPage);
|
||||
entityPositionSequence.getSequences().add(textPositionSequence);
|
||||
} else if(currentPage == textPositionSequence.getPage()){
|
||||
} else if (currentPage == textPositionSequence.getPage()) {
|
||||
entityPositionSequence.getSequences().add(textPositionSequence);
|
||||
} else {
|
||||
result.add(entityPositionSequence);
|
||||
@ -114,13 +137,14 @@ public class SearchableText {
|
||||
|
||||
|
||||
private boolean isSeparator(char c) {
|
||||
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
@ -137,10 +161,14 @@ public class SearchableText {
|
||||
previous = word;
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " ");
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
|
||||
.replaceAll("\n", " ")
|
||||
.replaceAll(" ", " ");
|
||||
}
|
||||
|
||||
public String getAsStringWithLinebreaks(){
|
||||
|
||||
public String getAsStringWithLinebreaks() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
|
||||
@ -60,10 +60,6 @@ public class DictionaryService {
|
||||
entryColors = typeResponse.getTypes()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
|
||||
dictionary = entryColors.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s)
|
||||
.getEntries())));
|
||||
hintTypes = typeResponse.getTypes()
|
||||
.stream()
|
||||
.filter(TypeResult::isHint)
|
||||
@ -74,6 +70,7 @@ public class DictionaryService {
|
||||
.filter(TypeResult::isCaseInsensitive)
|
||||
.map(TypeResult::getType)
|
||||
.collect(Collectors.toList());
|
||||
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
|
||||
}
|
||||
} catch (FeignException e) {
|
||||
log.warn("Got some unknown feignException", e);
|
||||
@ -81,4 +78,16 @@ public class DictionaryService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Set<String> convertEntries(String s) {
|
||||
if (caseInsensitiveTypes.contains(s)) {
|
||||
return dictionaryClient.getDictionaryForType(s)
|
||||
.getEntries()
|
||||
.stream()
|
||||
.map(String::toLowerCase)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
return new HashSet<>(dictionaryClient.getDictionaryForType(s).getEntries());
|
||||
}
|
||||
|
||||
}
|
||||
@ -65,7 +65,11 @@ public class EntityRedactionService {
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedSection.getEntities()) {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord()));
|
||||
if(dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
|
||||
} else{
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
|
||||
documentEntities.addAll(analysedSection.getEntities());
|
||||
@ -82,7 +86,11 @@ public class EntityRedactionService {
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedRowSection.getEntities()) {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord()));
|
||||
if(dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
|
||||
} else{
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
documentEntities.addAll(analysedRowSection.getEntities());
|
||||
}
|
||||
@ -99,23 +107,16 @@ public class EntityRedactionService {
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
||||
|
||||
String normalizedInputString = searchableText.toString();
|
||||
String inputString = searchableText.toString();
|
||||
String lowercaseInputString = inputString.toLowerCase();
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
|
||||
for (String value : entry.getValue()) {
|
||||
int startIndex;
|
||||
int stopIndex = 0;
|
||||
do {
|
||||
startIndex = normalizedInputString.indexOf(value, stopIndex);
|
||||
stopIndex = startIndex + value.length();
|
||||
|
||||
if (startIndex > -1 &&
|
||||
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
|
||||
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
if(dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())){
|
||||
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
|
||||
} else {
|
||||
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
|
||||
}
|
||||
}
|
||||
|
||||
@ -124,6 +125,28 @@ public class EntityRedactionService {
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> find(String inputString, Set<String> values, String type, String headline){
|
||||
Set<Entity> found = new HashSet<>();
|
||||
for (String value : values) {
|
||||
int startIndex;
|
||||
int stopIndex = 0;
|
||||
do {
|
||||
startIndex = inputString.indexOf(value, stopIndex);
|
||||
stopIndex = startIndex + value.length();
|
||||
|
||||
if (startIndex > -1 &&
|
||||
(startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) &&
|
||||
(stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private boolean isSeparator(char c) {
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
@ -71,6 +71,7 @@ public class RedactionIntegrationTest {
|
||||
private final Map<String, List<String>> dictionary = new HashMap<>();
|
||||
private final Map<String, float[]> typeColorMap = new HashMap<>();
|
||||
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
||||
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
|
||||
|
||||
@TestConfiguration
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
@ -82,7 +83,8 @@ public class RedactionIntegrationTest {
|
||||
|
||||
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
|
||||
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
|
||||
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
|
||||
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
|
||||
.newInputStreamResource(input));
|
||||
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
|
||||
kieBuilder.buildAll();
|
||||
KieModule kieModule = kieBuilder.getKieModule();
|
||||
@ -156,6 +158,12 @@ public class RedactionIntegrationTest {
|
||||
hintTypeMap.put(NAME_CODE, false);
|
||||
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
|
||||
hintTypeMap.put(DEFAULT, true);
|
||||
|
||||
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
|
||||
caseInSensitiveMap.put(ADDRESS_CODE, false);
|
||||
caseInSensitiveMap.put(NAME_CODE, false);
|
||||
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
|
||||
caseInSensitiveMap.put(DEFAULT, true);
|
||||
}
|
||||
|
||||
|
||||
@ -166,14 +174,22 @@ public class RedactionIntegrationTest {
|
||||
.map(typeColor -> TypeResult.builder()
|
||||
.type(typeColor.getKey())
|
||||
.color(typeColor.getValue())
|
||||
.isHint(hintTypeMap.get(typeColor.getKey())).build())
|
||||
.isHint(hintTypeMap.get(typeColor.getKey()))
|
||||
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
|
||||
.build())
|
||||
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private DictionaryResponse getDictionaryResponse(String type) {
|
||||
|
||||
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build();
|
||||
return DictionaryResponse.builder()
|
||||
.color(typeColorMap.get(type))
|
||||
.entries(dictionary.get(type))
|
||||
.isHint(hintTypeMap.get(type))
|
||||
.isCaseInsensitive(caseInSensitiveMap.get(type))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,3 +1 @@
|
||||
In Vitro
|
||||
In vitro
|
||||
in vitro
|
||||
In Vitro
|
||||
@ -100,15 +100,11 @@ Pseudacris triseriata
|
||||
poecilia reticulata
|
||||
poultry
|
||||
quail
|
||||
rabbit
|
||||
rabbits
|
||||
rainbow trout
|
||||
Rana limnocharis
|
||||
rana
|
||||
limnocharis
|
||||
rana pipiens
|
||||
rat
|
||||
rats
|
||||
reptile
|
||||
reptiles
|
||||
ricefish
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user