diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index 35663271..9b39e5fd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence { return text.charAt(0); } + public char charAt(int index, boolean caseInSensitive) { + TextPosition textPosition = textPositionAt(index); + String text = textPosition.getUnicode(); + return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0); + } + @Override public TextPositionSequence subSequence(int start, int end) { return new TextPositionSequence(textPositions.subList(start, end), page); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index a1983ff7..717fe3f3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -13,45 +13,65 @@ public class SearchableText { private List sequences = new ArrayList<>(); + public void add(TextPositionSequence textPositionSequence) { + sequences.add(textPositionSequence); } + public void addAll(List textPositionSequences) { + sequences.addAll(textPositionSequences); } - public List getSequences(String searchString) { + public List getSequences(String searchString, boolean caseInSensitive) { - char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray(); + String normalizedSearchString; + if (caseInSensitive) { + normalizedSearchString = searchString.toLowerCase(); + } else { + normalizedSearchString = searchString; + } + + char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray(); int counter = 0; - List crossSequenceParts = new ArrayList<>(); List finalMatches = new ArrayList<>(); for (int i = 0; i < sequences.size(); i++) { TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage()); for (int j = 0; j < sequences.get(i).length(); j++) { - if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' ' - || j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){ - if(j == sequences.get(i).length() -1 && counter != 0 && !partMatch.getTextPositions().isEmpty()){ + if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInSensitive) == ' ' && sequences.get(i - 1) + .charAt(sequences.get(i - 1).length() - 1, caseInSensitive) == ' ' || j > 0 && sequences.get(i) + .charAt(j, caseInSensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInSensitive) == ' ') { + if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) { crossSequenceParts.add(partMatch); } continue; } - if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){ + if (j == 0 && sequences.get(i).charAt(j, caseInSensitive) != ' ' && i != 0 && sequences.get(i - 1) + .charAt(sequences.get(i - 1) + .length() - 1, caseInSensitive) != ' ' && searchChars[counter] == ' ') { counter++; } - if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') { + if (sequences.get(i) + .charAt(j, caseInSensitive) == searchChars[counter] || counter != 0 && sequences.get(i) + .charAt(j, caseInSensitive) == '-') { - if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1)) - || j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') { + if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i) + .charAt(j - 1, caseInSensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1) + .charAt(sequences.get(i - 1) + .length() - 1, caseInSensitive)) || j == 0 && i != 0 && sequences.get(i - 1) + .charAt(sequences.get(i - 1).length() - 1, caseInSensitive) != ' ' && sequences.get(i) + .charAt(j, caseInSensitive) != ' ') { partMatch.add(sequences.get(i).textPositionAt(j)); - if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) { + if (!(j == sequences.get(i).length() - 1 && sequences.get(i) + .charAt(j, caseInSensitive) == '-' && searchChars[counter] != '-')) { counter++; } } @@ -59,10 +79,13 @@ public class SearchableText { if (counter == searchString.length()) { crossSequenceParts.add(partMatch); - if(i == sequences.size() - 1 && j == sequences.get(i).length() -1 - || j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1)) - || j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0)) - || j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') { + if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i) + .length() - 1 && isSeparator(sequences.get(i) + .charAt(j + 1, caseInSensitive)) || j == sequences.get(i) + .length() - 1 && isSeparator(sequences.get(i + 1) + .charAt(0, caseInSensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i) + .charAt(j, caseInSensitive) != ' ' && sequences.get(i + 1) + .charAt(0, caseInSensitive) != ' ') { finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts)); } @@ -72,14 +95,14 @@ public class SearchableText { } } else { counter = 0; - if(!crossSequenceParts.isEmpty()){ + if (!crossSequenceParts.isEmpty()) { j--; } crossSequenceParts = new ArrayList<>(); partMatch = new TextPositionSequence(sequences.get(i).getPage()); } - if(j == sequences.get(i).length() -1 && counter != 0){ + if (j == sequences.get(i).length() - 1 && counter != 0) { crossSequenceParts.add(partMatch); } } @@ -89,18 +112,18 @@ public class SearchableText { } - private List buildEntityPositionSequence(List crossSequenceParts){ + private List buildEntityPositionSequence(List crossSequenceParts) { UUID id = UUID.randomUUID(); List result = new ArrayList<>(); int currentPage = -1; EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id); - for (TextPositionSequence textPositionSequence :crossSequenceParts){ - if(currentPage == -1){ + for (TextPositionSequence textPositionSequence : crossSequenceParts) { + if (currentPage == -1) { currentPage = textPositionSequence.getPage(); entityPositionSequence.setPageNumber(currentPage); entityPositionSequence.getSequences().add(textPositionSequence); - } else if(currentPage == textPositionSequence.getPage()){ + } else if (currentPage == textPositionSequence.getPage()) { entityPositionSequence.getSequences().add(textPositionSequence); } else { result.add(entityPositionSequence); @@ -114,13 +137,14 @@ public class SearchableText { private boolean isSeparator(char c) { + return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; } - @Override public String toString() { + StringBuilder sb = new StringBuilder(); TextPositionSequence previous = null; @@ -137,10 +161,14 @@ public class SearchableText { previous = word; } - return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " "); + return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()) + .replaceAll("\n", " ") + .replaceAll(" ", " "); } - public String getAsStringWithLinebreaks(){ + + public String getAsStringWithLinebreaks() { + StringBuilder sb = new StringBuilder(); TextPositionSequence previous = null; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java index d245c5d0..bff060b9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java @@ -60,10 +60,6 @@ public class DictionaryService { entryColors = typeResponse.getTypes() .stream() .collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor)); - dictionary = entryColors.keySet() - .stream() - .collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s) - .getEntries()))); hintTypes = typeResponse.getTypes() .stream() .filter(TypeResult::isHint) @@ -74,6 +70,7 @@ public class DictionaryService { .filter(TypeResult::isCaseInsensitive) .map(TypeResult::getType) .collect(Collectors.toList()); + dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s))); } } catch (FeignException e) { log.warn("Got some unknown feignException", e); @@ -81,4 +78,16 @@ public class DictionaryService { } } + + private Set convertEntries(String s) { + if (caseInsensitiveTypes.contains(s)) { + return dictionaryClient.getDictionaryForType(s) + .getEntries() + .stream() + .map(String::toLowerCase) + .collect(Collectors.toSet()); + } + return new HashSet<>(dictionaryClient.getDictionaryForType(s).getEntries()); + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index a5f3a802..10f9b851 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -65,7 +65,11 @@ public class EntityRedactionService { .build()); for (Entity entity : analysedSection.getEntities()) { - entity.setPositionSequences(searchableText.getSequences(entity.getWord())); + if(dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) { + entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true)); + } else{ + entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false)); + } } documentEntities.addAll(analysedSection.getEntities()); @@ -82,7 +86,11 @@ public class EntityRedactionService { .build()); for (Entity entity : analysedRowSection.getEntities()) { - entity.setPositionSequences(searchableRow.getSequences(entity.getWord())); + if(dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) { + entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true)); + } else{ + entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false)); + } } documentEntities.addAll(analysedRowSection.getEntities()); } @@ -99,23 +107,16 @@ public class EntityRedactionService { private Set findEntities(SearchableText searchableText, String headline) { - String normalizedInputString = searchableText.toString(); + String inputString = searchableText.toString(); + String lowercaseInputString = inputString.toLowerCase(); Set found = new HashSet<>(); for (Map.Entry> entry : dictionaryService.getDictionary().entrySet()) { - for (String value : entry.getValue()) { - int startIndex; - int stopIndex = 0; - do { - startIndex = normalizedInputString.indexOf(value, stopIndex); - stopIndex = startIndex + value.length(); - if (startIndex > -1 && - (startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) && - (stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) { - found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline)); - } - } while (startIndex > -1); + if(dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())){ + found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline)); + } else { + found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline)); } } @@ -124,6 +125,28 @@ public class EntityRedactionService { return found; } + + private Set find(String inputString, Set values, String type, String headline){ + Set found = new HashSet<>(); + for (String value : values) { + int startIndex; + int stopIndex = 0; + do { + startIndex = inputString.indexOf(value, stopIndex); + stopIndex = startIndex + value.length(); + + if (startIndex > -1 && + (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && + (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline)); + } + } while (startIndex > -1); + } + return found; + } + + + private boolean isSeparator(char c) { return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 2446864d..8aff773a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -71,6 +71,7 @@ public class RedactionIntegrationTest { private final Map> dictionary = new HashMap<>(); private final Map typeColorMap = new HashMap<>(); private final Map hintTypeMap = new HashMap<>(); + private final Map caseInSensitiveMap = new HashMap<>(); @TestConfiguration public static class RedactionIntegrationTestConfiguration { @@ -82,7 +83,8 @@ public class RedactionIntegrationTest { KieFileSystem kieFileSystem = kieServices.newKieFileSystem(); InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8)); - kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input)); + kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources() + .newInputStreamResource(input)); KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem); kieBuilder.buildAll(); KieModule kieModule = kieBuilder.getKieModule(); @@ -156,6 +158,12 @@ public class RedactionIntegrationTest { hintTypeMap.put(NAME_CODE, false); hintTypeMap.put(NO_REDACTION_INDICATOR, true); hintTypeMap.put(DEFAULT, true); + + caseInSensitiveMap.put(VERTEBRATES_CODE, true); + caseInSensitiveMap.put(ADDRESS_CODE, false); + caseInSensitiveMap.put(NAME_CODE, false); + caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true); + caseInSensitiveMap.put(DEFAULT, true); } @@ -166,14 +174,22 @@ public class RedactionIntegrationTest { .map(typeColor -> TypeResult.builder() .type(typeColor.getKey()) .color(typeColor.getValue()) - .isHint(hintTypeMap.get(typeColor.getKey())).build()) + .isHint(hintTypeMap.get(typeColor.getKey())) + .isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey())) + .build()) + .collect(Collectors.toList()); } private DictionaryResponse getDictionaryResponse(String type) { - return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build(); + return DictionaryResponse.builder() + .color(typeColorMap.get(type)) + .entries(dictionary.get(type)) + .isHint(hintTypeMap.get(type)) + .isCaseInsensitive(caseInSensitiveMap.get(type)) + .build(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/NoRedactionIndicator.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/NoRedactionIndicator.txt index d53218c5..def512e4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/NoRedactionIndicator.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/NoRedactionIndicator.txt @@ -1,3 +1 @@ -In Vitro -In vitro -in vitro \ No newline at end of file +In Vitro \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/vertebrates.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/vertebrates.txt index 4b394818..6337a8b0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/vertebrates.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/vertebrates.txt @@ -100,15 +100,11 @@ Pseudacris triseriata poecilia reticulata poultry quail -rabbit -rabbits rainbow trout Rana limnocharis rana limnocharis rana pipiens -rat -rats reptile reptiles ricefish