Pull request #14: RED-207: Match caseInsensitive dictionaries caseInSensitive
Merge in RED/redaction-service from RED-207 to master. * commit '135a715e22e6c2536b268db29161552cfd7a6c1c': Fixed style in EntityRedactionService; Fixed wrong naming of caseInsensitive; RED-207: Match caseInsensitive dictionaries caseInSensitive
This commit is contained in:
commit
b7ee62f44d
@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence {
|
||||
return text.charAt(0);
|
||||
}
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||
|
||||
@ -13,45 +13,65 @@ public class SearchableText {
|
||||
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
|
||||
public void add(TextPositionSequence textPositionSequence) {
|
||||
|
||||
sequences.add(textPositionSequence);
|
||||
}
|
||||
|
||||
|
||||
public void addAll(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
sequences.addAll(textPositionSequences);
|
||||
}
|
||||
|
||||
|
||||
public List<EntityPositionSequence> getSequences(String searchString) {
|
||||
public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive) {
|
||||
|
||||
char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray();
|
||||
String normalizedSearchString;
|
||||
if (caseInsensitive) {
|
||||
normalizedSearchString = searchString.toLowerCase();
|
||||
} else {
|
||||
normalizedSearchString = searchString;
|
||||
}
|
||||
|
||||
char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray();
|
||||
int counter = 0;
|
||||
|
||||
|
||||
List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
|
||||
List<EntityPositionSequence> finalMatches = new ArrayList<>();
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
||||
for (int j = 0; j < sequences.get(i).length(); j++) {
|
||||
|
||||
if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' '
|
||||
|| j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){
|
||||
if(j == sequences.get(i).length() -1 && counter != 0 && !partMatch.getTextPositions().isEmpty()){
|
||||
if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i)
|
||||
.charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') {
|
||||
if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
|
||||
crossSequenceParts.add(partMatch);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){
|
||||
if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1)
|
||||
.length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') {
|
||||
counter++;
|
||||
}
|
||||
|
||||
if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') {
|
||||
if (sequences.get(i)
|
||||
.charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
|
||||
.charAt(j, caseInsensitive) == '-') {
|
||||
|
||||
if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1))
|
||||
|| j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') {
|
||||
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
|
||||
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1)
|
||||
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
|
||||
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i)
|
||||
.charAt(j, caseInsensitive) != ' ') {
|
||||
partMatch.add(sequences.get(i).textPositionAt(j));
|
||||
if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) {
|
||||
if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
|
||||
.charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) {
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
@ -59,10 +79,13 @@ public class SearchableText {
|
||||
if (counter == searchString.length()) {
|
||||
crossSequenceParts.add(partMatch);
|
||||
|
||||
if(i == sequences.size() - 1 && j == sequences.get(i).length() -1
|
||||
|| j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1))
|
||||
|| j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0))
|
||||
|| j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') {
|
||||
if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
|
||||
.length() - 1 && isSeparator(sequences.get(i)
|
||||
.charAt(j + 1, caseInsensitive)) || j == sequences.get(i)
|
||||
.length() - 1 && isSeparator(sequences.get(i + 1)
|
||||
.charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
|
||||
.charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1)
|
||||
.charAt(0, caseInsensitive) != ' ') {
|
||||
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
|
||||
}
|
||||
|
||||
@ -72,14 +95,14 @@ public class SearchableText {
|
||||
}
|
||||
} else {
|
||||
counter = 0;
|
||||
if(!crossSequenceParts.isEmpty()){
|
||||
if (!crossSequenceParts.isEmpty()) {
|
||||
j--;
|
||||
}
|
||||
crossSequenceParts = new ArrayList<>();
|
||||
partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
||||
}
|
||||
|
||||
if(j == sequences.get(i).length() -1 && counter != 0){
|
||||
if (j == sequences.get(i).length() - 1 && counter != 0) {
|
||||
crossSequenceParts.add(partMatch);
|
||||
}
|
||||
}
|
||||
@ -89,18 +112,18 @@ public class SearchableText {
|
||||
}
|
||||
|
||||
|
||||
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts){
|
||||
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
|
||||
|
||||
UUID id = UUID.randomUUID();
|
||||
List<EntityPositionSequence> result = new ArrayList<>();
|
||||
int currentPage = -1;
|
||||
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
|
||||
for (TextPositionSequence textPositionSequence :crossSequenceParts){
|
||||
if(currentPage == -1){
|
||||
for (TextPositionSequence textPositionSequence : crossSequenceParts) {
|
||||
if (currentPage == -1) {
|
||||
currentPage = textPositionSequence.getPage();
|
||||
entityPositionSequence.setPageNumber(currentPage);
|
||||
entityPositionSequence.getSequences().add(textPositionSequence);
|
||||
} else if(currentPage == textPositionSequence.getPage()){
|
||||
} else if (currentPage == textPositionSequence.getPage()) {
|
||||
entityPositionSequence.getSequences().add(textPositionSequence);
|
||||
} else {
|
||||
result.add(entityPositionSequence);
|
||||
@ -114,13 +137,14 @@ public class SearchableText {
|
||||
|
||||
|
||||
private boolean isSeparator(char c) {
|
||||
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
@ -137,10 +161,14 @@ public class SearchableText {
|
||||
previous = word;
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " ");
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
|
||||
.replaceAll("\n", " ")
|
||||
.replaceAll(" ", " ");
|
||||
}
|
||||
|
||||
public String getAsStringWithLinebreaks(){
|
||||
|
||||
public String getAsStringWithLinebreaks() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
|
||||
@ -60,10 +60,6 @@ public class DictionaryService {
|
||||
entryColors = typeResponse.getTypes()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
|
||||
dictionary = entryColors.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s)
|
||||
.getEntries())));
|
||||
hintTypes = typeResponse.getTypes()
|
||||
.stream()
|
||||
.filter(TypeResult::isHint)
|
||||
@ -74,6 +70,7 @@ public class DictionaryService {
|
||||
.filter(TypeResult::isCaseInsensitive)
|
||||
.map(TypeResult::getType)
|
||||
.collect(Collectors.toList());
|
||||
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
|
||||
}
|
||||
} catch (FeignException e) {
|
||||
log.warn("Got some unknown feignException", e);
|
||||
@ -81,4 +78,16 @@ public class DictionaryService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Set<String> convertEntries(String s) {
|
||||
if (caseInsensitiveTypes.contains(s)) {
|
||||
return dictionaryClient.getDictionaryForType(s)
|
||||
.getEntries()
|
||||
.stream()
|
||||
.map(String::toLowerCase)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
return new HashSet<>(dictionaryClient.getDictionaryForType(s).getEntries());
|
||||
}
|
||||
|
||||
}
|
||||
@ -27,6 +27,7 @@ public class EntityRedactionService {
|
||||
private final DictionaryService dictionaryService;
|
||||
private final DroolsExecutionService droolsExecutionService;
|
||||
|
||||
|
||||
public void processDocument(Document classifiedDoc) {
|
||||
|
||||
dictionaryService.updateDictionary();
|
||||
@ -56,8 +57,7 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
|
||||
Section analysedSection = droolsExecutionService.executeRules(Section
|
||||
.builder()
|
||||
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
|
||||
.entities(entities)
|
||||
.text(searchableText.getAsStringWithLinebreaks())
|
||||
.searchText(searchableText.toString())
|
||||
@ -65,7 +65,11 @@ public class EntityRedactionService {
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedSection.getEntities()) {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord()));
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
|
||||
} else {
|
||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
|
||||
documentEntities.addAll(analysedSection.getEntities());
|
||||
@ -73,8 +77,7 @@ public class EntityRedactionService {
|
||||
for (SearchableText searchableRow : searchableRows) {
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
|
||||
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(Section
|
||||
.builder()
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
|
||||
.entities(rowEntities)
|
||||
.text(searchableRow.getAsStringWithLinebreaks())
|
||||
.searchText(searchableRow.toString())
|
||||
@ -82,7 +85,11 @@ public class EntityRedactionService {
|
||||
.build());
|
||||
|
||||
for (Entity entity : analysedRowSection.getEntities()) {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord()));
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
|
||||
} else {
|
||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
|
||||
}
|
||||
}
|
||||
documentEntities.addAll(analysedRowSection.getEntities());
|
||||
}
|
||||
@ -90,32 +97,27 @@ public class EntityRedactionService {
|
||||
|
||||
documentEntities.forEach(entity -> {
|
||||
entity.getPositionSequences().forEach(sequence -> {
|
||||
classifiedDoc.getEntities().computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>()).add(
|
||||
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence), entity.getHeadline(), entity.getMatchedRule())
|
||||
);
|
||||
classifiedDoc.getEntities()
|
||||
.computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List
|
||||
.of(sequence), entity.getHeadline(), entity.getMatchedRule()));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
||||
|
||||
String normalizedInputString = searchableText.toString();
|
||||
String inputString = searchableText.toString();
|
||||
String lowercaseInputString = inputString.toLowerCase();
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
|
||||
for (String value : entry.getValue()) {
|
||||
int startIndex;
|
||||
int stopIndex = 0;
|
||||
do {
|
||||
startIndex = normalizedInputString.indexOf(value, stopIndex);
|
||||
stopIndex = startIndex + value.length();
|
||||
|
||||
if (startIndex > -1 &&
|
||||
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
|
||||
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
|
||||
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
|
||||
} else {
|
||||
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
|
||||
}
|
||||
}
|
||||
|
||||
@ -124,19 +126,45 @@ public class EntityRedactionService {
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
for (String value : values) {
|
||||
int startIndex;
|
||||
int stopIndex = 0;
|
||||
do {
|
||||
startIndex = inputString.indexOf(value, stopIndex);
|
||||
stopIndex = startIndex + value.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSeparator(char c) {
|
||||
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord().length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -71,6 +71,7 @@ public class RedactionIntegrationTest {
|
||||
private final Map<String, List<String>> dictionary = new HashMap<>();
|
||||
private final Map<String, float[]> typeColorMap = new HashMap<>();
|
||||
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
||||
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
|
||||
|
||||
@TestConfiguration
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
@ -82,7 +83,8 @@ public class RedactionIntegrationTest {
|
||||
|
||||
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
|
||||
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
|
||||
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
|
||||
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
|
||||
.newInputStreamResource(input));
|
||||
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
|
||||
kieBuilder.buildAll();
|
||||
KieModule kieModule = kieBuilder.getKieModule();
|
||||
@ -156,6 +158,12 @@ public class RedactionIntegrationTest {
|
||||
hintTypeMap.put(NAME_CODE, false);
|
||||
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
|
||||
hintTypeMap.put(DEFAULT, true);
|
||||
|
||||
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
|
||||
caseInSensitiveMap.put(ADDRESS_CODE, false);
|
||||
caseInSensitiveMap.put(NAME_CODE, false);
|
||||
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
|
||||
caseInSensitiveMap.put(DEFAULT, true);
|
||||
}
|
||||
|
||||
|
||||
@ -166,14 +174,22 @@ public class RedactionIntegrationTest {
|
||||
.map(typeColor -> TypeResult.builder()
|
||||
.type(typeColor.getKey())
|
||||
.color(typeColor.getValue())
|
||||
.isHint(hintTypeMap.get(typeColor.getKey())).build())
|
||||
.isHint(hintTypeMap.get(typeColor.getKey()))
|
||||
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
|
||||
.build())
|
||||
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private DictionaryResponse getDictionaryResponse(String type) {
|
||||
|
||||
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build();
|
||||
return DictionaryResponse.builder()
|
||||
.color(typeColorMap.get(type))
|
||||
.entries(dictionary.get(type))
|
||||
.isHint(hintTypeMap.get(type))
|
||||
.isCaseInsensitive(caseInSensitiveMap.get(type))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,3 +1 @@
|
||||
In Vitro
|
||||
In vitro
|
||||
in vitro
|
||||
@ -100,15 +100,11 @@ Pseudacris triseriata
|
||||
poecilia reticulata
|
||||
poultry
|
||||
quail
|
||||
rabbit
|
||||
rabbits
|
||||
rainbow trout
|
||||
Rana limnocharis
|
||||
rana
|
||||
limnocharis
|
||||
rana pipiens
|
||||
rat
|
||||
rats
|
||||
reptile
|
||||
reptiles
|
||||
ricefish
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user