Pull request #14: RED-207: Match caseInsensitive dictionaries case-insensitively
Merge in RED/redaction-service from RED-207 to master. * commit '135a715e22e6c2536b268db29161552cfd7a6c1c': Fixed style in EntityRedactionService; fixed wrong naming of caseInsensitive; RED-207: Match caseInsensitive dictionaries case-insensitively
This commit is contained in:
commit
b7ee62f44d
@ -39,6 +39,12 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
return text.charAt(0);
|
return text.charAt(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public char charAt(int index, boolean caseInSensitive) {
|
||||||
|
TextPosition textPosition = textPositionAt(index);
|
||||||
|
String text = textPosition.getUnicode();
|
||||||
|
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TextPositionSequence subSequence(int start, int end) {
|
public TextPositionSequence subSequence(int start, int end) {
|
||||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||||
|
|||||||
@ -13,45 +13,65 @@ public class SearchableText {
|
|||||||
|
|
||||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPositionSequence textPositionSequence) {
|
public void add(TextPositionSequence textPositionSequence) {
|
||||||
|
|
||||||
sequences.add(textPositionSequence);
|
sequences.add(textPositionSequence);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addAll(List<TextPositionSequence> textPositionSequences) {
|
public void addAll(List<TextPositionSequence> textPositionSequences) {
|
||||||
|
|
||||||
sequences.addAll(textPositionSequences);
|
sequences.addAll(textPositionSequences);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<EntityPositionSequence> getSequences(String searchString) {
|
public List<EntityPositionSequence> getSequences(String searchString, boolean caseInsensitive) {
|
||||||
|
|
||||||
char[] searchChars = searchString.replaceAll("\\n", " ").toCharArray();
|
String normalizedSearchString;
|
||||||
|
if (caseInsensitive) {
|
||||||
|
normalizedSearchString = searchString.toLowerCase();
|
||||||
|
} else {
|
||||||
|
normalizedSearchString = searchString;
|
||||||
|
}
|
||||||
|
|
||||||
|
char[] searchChars = normalizedSearchString.replaceAll("\\n", " ").toCharArray();
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
|
|
||||||
|
|
||||||
List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
|
List<TextPositionSequence> crossSequenceParts = new ArrayList<>();
|
||||||
List<EntityPositionSequence> finalMatches = new ArrayList<>();
|
List<EntityPositionSequence> finalMatches = new ArrayList<>();
|
||||||
for (int i = 0; i < sequences.size(); i++) {
|
for (int i = 0; i < sequences.size(); i++) {
|
||||||
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
TextPositionSequence partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
||||||
for (int j = 0; j < sequences.get(i).length(); j++) {
|
for (int j = 0; j < sequences.get(i).length(); j++) {
|
||||||
|
|
||||||
if(i > 0 && j == 0 && sequences.get(i).charAt(0) == ' ' && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) == ' '
|
if (i > 0 && j == 0 && sequences.get(i).charAt(0, caseInsensitive) == ' ' && sequences.get(i - 1)
|
||||||
|| j > 0 && sequences.get(i).charAt(j) == ' ' && sequences.get(i).charAt(j - 1) == ' '){
|
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) == ' ' || j > 0 && sequences.get(i)
|
||||||
if(j == sequences.get(i).length() -1 && counter != 0 && !partMatch.getTextPositions().isEmpty()){
|
.charAt(j, caseInsensitive) == ' ' && sequences.get(i).charAt(j - 1, caseInsensitive) == ' ') {
|
||||||
|
if (j == sequences.get(i).length() - 1 && counter != 0 && !partMatch.getTextPositions().isEmpty()) {
|
||||||
crossSequenceParts.add(partMatch);
|
crossSequenceParts.add(partMatch);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(j == 0 && sequences.get(i).charAt(j) != ' ' && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && searchChars[counter] == ' '){
|
if (j == 0 && sequences.get(i).charAt(j, caseInsensitive) != ' ' && i != 0 && sequences.get(i - 1)
|
||||||
|
.charAt(sequences.get(i - 1)
|
||||||
|
.length() - 1, caseInsensitive) != ' ' && searchChars[counter] == ' ') {
|
||||||
counter++;
|
counter++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sequences.get(i).charAt(j) == searchChars[counter] || counter != 0 && sequences.get(i).charAt(j) == '-') {
|
if (sequences.get(i)
|
||||||
|
.charAt(j, caseInsensitive) == searchChars[counter] || counter != 0 && sequences.get(i)
|
||||||
|
.charAt(j, caseInsensitive) == '-') {
|
||||||
|
|
||||||
if(counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i).charAt(j - 1)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1))
|
if (counter != 0 || i == 0 && j == 0 || j != 0 && isSeparator(sequences.get(i)
|
||||||
|| j == 0 && i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() -1) != ' ' && sequences.get(i).charAt(j) != ' ') {
|
.charAt(j - 1, caseInsensitive)) || j == 0 && i != 0 && isSeparator(sequences.get(i - 1)
|
||||||
|
.charAt(sequences.get(i - 1)
|
||||||
|
.length() - 1, caseInsensitive)) || j == 0 && i != 0 && sequences.get(i - 1)
|
||||||
|
.charAt(sequences.get(i - 1).length() - 1, caseInsensitive) != ' ' && sequences.get(i)
|
||||||
|
.charAt(j, caseInsensitive) != ' ') {
|
||||||
partMatch.add(sequences.get(i).textPositionAt(j));
|
partMatch.add(sequences.get(i).textPositionAt(j));
|
||||||
if (!(j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) == '-' && searchChars[counter] != '-')) {
|
if (!(j == sequences.get(i).length() - 1 && sequences.get(i)
|
||||||
|
.charAt(j, caseInsensitive) == '-' && searchChars[counter] != '-')) {
|
||||||
counter++;
|
counter++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -59,10 +79,13 @@ public class SearchableText {
|
|||||||
if (counter == searchString.length()) {
|
if (counter == searchString.length()) {
|
||||||
crossSequenceParts.add(partMatch);
|
crossSequenceParts.add(partMatch);
|
||||||
|
|
||||||
if(i == sequences.size() - 1 && j == sequences.get(i).length() -1
|
if (i == sequences.size() - 1 && j == sequences.get(i).length() - 1 || j != sequences.get(i)
|
||||||
|| j != sequences.get(i).length() -1 && isSeparator(sequences.get(i).charAt(j +1))
|
.length() - 1 && isSeparator(sequences.get(i)
|
||||||
|| j == sequences.get(i).length() -1 && isSeparator(sequences.get(i + 1).charAt(0))
|
.charAt(j + 1, caseInsensitive)) || j == sequences.get(i)
|
||||||
|| j == sequences.get(i).length() -1 && sequences.get(i).charAt(j) != ' ' && sequences.get(i + 1).charAt(0) != ' ') {
|
.length() - 1 && isSeparator(sequences.get(i + 1)
|
||||||
|
.charAt(0, caseInsensitive)) || j == sequences.get(i).length() - 1 && sequences.get(i)
|
||||||
|
.charAt(j, caseInsensitive) != ' ' && sequences.get(i + 1)
|
||||||
|
.charAt(0, caseInsensitive) != ' ') {
|
||||||
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
|
finalMatches.addAll(buildEntityPositionSequence(crossSequenceParts));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,14 +95,14 @@ public class SearchableText {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
counter = 0;
|
counter = 0;
|
||||||
if(!crossSequenceParts.isEmpty()){
|
if (!crossSequenceParts.isEmpty()) {
|
||||||
j--;
|
j--;
|
||||||
}
|
}
|
||||||
crossSequenceParts = new ArrayList<>();
|
crossSequenceParts = new ArrayList<>();
|
||||||
partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
partMatch = new TextPositionSequence(sequences.get(i).getPage());
|
||||||
}
|
}
|
||||||
|
|
||||||
if(j == sequences.get(i).length() -1 && counter != 0){
|
if (j == sequences.get(i).length() - 1 && counter != 0) {
|
||||||
crossSequenceParts.add(partMatch);
|
crossSequenceParts.add(partMatch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -89,18 +112,18 @@ public class SearchableText {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts){
|
private List<EntityPositionSequence> buildEntityPositionSequence(List<TextPositionSequence> crossSequenceParts) {
|
||||||
|
|
||||||
UUID id = UUID.randomUUID();
|
UUID id = UUID.randomUUID();
|
||||||
List<EntityPositionSequence> result = new ArrayList<>();
|
List<EntityPositionSequence> result = new ArrayList<>();
|
||||||
int currentPage = -1;
|
int currentPage = -1;
|
||||||
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
|
EntityPositionSequence entityPositionSequence = new EntityPositionSequence(id);
|
||||||
for (TextPositionSequence textPositionSequence :crossSequenceParts){
|
for (TextPositionSequence textPositionSequence : crossSequenceParts) {
|
||||||
if(currentPage == -1){
|
if (currentPage == -1) {
|
||||||
currentPage = textPositionSequence.getPage();
|
currentPage = textPositionSequence.getPage();
|
||||||
entityPositionSequence.setPageNumber(currentPage);
|
entityPositionSequence.setPageNumber(currentPage);
|
||||||
entityPositionSequence.getSequences().add(textPositionSequence);
|
entityPositionSequence.getSequences().add(textPositionSequence);
|
||||||
} else if(currentPage == textPositionSequence.getPage()){
|
} else if (currentPage == textPositionSequence.getPage()) {
|
||||||
entityPositionSequence.getSequences().add(textPositionSequence);
|
entityPositionSequence.getSequences().add(textPositionSequence);
|
||||||
} else {
|
} else {
|
||||||
result.add(entityPositionSequence);
|
result.add(entityPositionSequence);
|
||||||
@ -114,13 +137,14 @@ public class SearchableText {
|
|||||||
|
|
||||||
|
|
||||||
private boolean isSeparator(char c) {
|
private boolean isSeparator(char c) {
|
||||||
|
|
||||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
TextPositionSequence previous = null;
|
TextPositionSequence previous = null;
|
||||||
@ -137,10 +161,14 @@ public class SearchableText {
|
|||||||
previous = word;
|
previous = word;
|
||||||
}
|
}
|
||||||
|
|
||||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" ", " ");
|
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString())
|
||||||
|
.replaceAll("\n", " ")
|
||||||
|
.replaceAll(" ", " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getAsStringWithLinebreaks(){
|
|
||||||
|
public String getAsStringWithLinebreaks() {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
TextPositionSequence previous = null;
|
TextPositionSequence previous = null;
|
||||||
|
|||||||
@ -60,10 +60,6 @@ public class DictionaryService {
|
|||||||
entryColors = typeResponse.getTypes()
|
entryColors = typeResponse.getTypes()
|
||||||
.stream()
|
.stream()
|
||||||
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
|
.collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
|
||||||
dictionary = entryColors.keySet()
|
|
||||||
.stream()
|
|
||||||
.collect(Collectors.toMap(type -> type, s -> new HashSet<>(dictionaryClient.getDictionaryForType(s)
|
|
||||||
.getEntries())));
|
|
||||||
hintTypes = typeResponse.getTypes()
|
hintTypes = typeResponse.getTypes()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(TypeResult::isHint)
|
.filter(TypeResult::isHint)
|
||||||
@ -74,6 +70,7 @@ public class DictionaryService {
|
|||||||
.filter(TypeResult::isCaseInsensitive)
|
.filter(TypeResult::isCaseInsensitive)
|
||||||
.map(TypeResult::getType)
|
.map(TypeResult::getType)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
|
||||||
}
|
}
|
||||||
} catch (FeignException e) {
|
} catch (FeignException e) {
|
||||||
log.warn("Got some unknown feignException", e);
|
log.warn("Got some unknown feignException", e);
|
||||||
@ -81,4 +78,16 @@ public class DictionaryService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Set<String> convertEntries(String s) {
|
||||||
|
if (caseInsensitiveTypes.contains(s)) {
|
||||||
|
return dictionaryClient.getDictionaryForType(s)
|
||||||
|
.getEntries()
|
||||||
|
.stream()
|
||||||
|
.map(String::toLowerCase)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
}
|
||||||
|
return new HashSet<>(dictionaryClient.getDictionaryForType(s).getEntries());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -27,6 +27,7 @@ public class EntityRedactionService {
|
|||||||
private final DictionaryService dictionaryService;
|
private final DictionaryService dictionaryService;
|
||||||
private final DroolsExecutionService droolsExecutionService;
|
private final DroolsExecutionService droolsExecutionService;
|
||||||
|
|
||||||
|
|
||||||
public void processDocument(Document classifiedDoc) {
|
public void processDocument(Document classifiedDoc) {
|
||||||
|
|
||||||
dictionaryService.updateDictionary();
|
dictionaryService.updateDictionary();
|
||||||
@ -56,8 +57,7 @@ public class EntityRedactionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
|
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
|
||||||
Section analysedSection = droolsExecutionService.executeRules(Section
|
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
|
||||||
.builder()
|
|
||||||
.entities(entities)
|
.entities(entities)
|
||||||
.text(searchableText.getAsStringWithLinebreaks())
|
.text(searchableText.getAsStringWithLinebreaks())
|
||||||
.searchText(searchableText.toString())
|
.searchText(searchableText.toString())
|
||||||
@ -65,7 +65,11 @@ public class EntityRedactionService {
|
|||||||
.build());
|
.build());
|
||||||
|
|
||||||
for (Entity entity : analysedSection.getEntities()) {
|
for (Entity entity : analysedSection.getEntities()) {
|
||||||
entity.setPositionSequences(searchableText.getSequences(entity.getWord()));
|
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||||
|
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
|
||||||
|
} else {
|
||||||
|
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
documentEntities.addAll(analysedSection.getEntities());
|
documentEntities.addAll(analysedSection.getEntities());
|
||||||
@ -73,8 +77,7 @@ public class EntityRedactionService {
|
|||||||
for (SearchableText searchableRow : searchableRows) {
|
for (SearchableText searchableRow : searchableRows) {
|
||||||
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
|
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
|
||||||
|
|
||||||
Section analysedRowSection = droolsExecutionService.executeRules(Section
|
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
|
||||||
.builder()
|
|
||||||
.entities(rowEntities)
|
.entities(rowEntities)
|
||||||
.text(searchableRow.getAsStringWithLinebreaks())
|
.text(searchableRow.getAsStringWithLinebreaks())
|
||||||
.searchText(searchableRow.toString())
|
.searchText(searchableRow.toString())
|
||||||
@ -82,7 +85,11 @@ public class EntityRedactionService {
|
|||||||
.build());
|
.build());
|
||||||
|
|
||||||
for (Entity entity : analysedRowSection.getEntities()) {
|
for (Entity entity : analysedRowSection.getEntities()) {
|
||||||
entity.setPositionSequences(searchableRow.getSequences(entity.getWord()));
|
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
|
||||||
|
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
|
||||||
|
} else {
|
||||||
|
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
documentEntities.addAll(analysedRowSection.getEntities());
|
documentEntities.addAll(analysedRowSection.getEntities());
|
||||||
}
|
}
|
||||||
@ -90,32 +97,27 @@ public class EntityRedactionService {
|
|||||||
|
|
||||||
documentEntities.forEach(entity -> {
|
documentEntities.forEach(entity -> {
|
||||||
entity.getPositionSequences().forEach(sequence -> {
|
entity.getPositionSequences().forEach(sequence -> {
|
||||||
classifiedDoc.getEntities().computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>()).add(
|
classifiedDoc.getEntities()
|
||||||
new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List.of(sequence), entity.getHeadline(), entity.getMatchedRule())
|
.computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>())
|
||||||
);
|
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List
|
||||||
|
.of(sequence), entity.getHeadline(), entity.getMatchedRule()));
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
||||||
|
|
||||||
String normalizedInputString = searchableText.toString();
|
String inputString = searchableText.toString();
|
||||||
|
String lowercaseInputString = inputString.toLowerCase();
|
||||||
|
|
||||||
Set<Entity> found = new HashSet<>();
|
Set<Entity> found = new HashSet<>();
|
||||||
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
|
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
|
||||||
for (String value : entry.getValue()) {
|
|
||||||
int startIndex;
|
|
||||||
int stopIndex = 0;
|
|
||||||
do {
|
|
||||||
startIndex = normalizedInputString.indexOf(value, stopIndex);
|
|
||||||
stopIndex = startIndex + value.length();
|
|
||||||
|
|
||||||
if (startIndex > -1 &&
|
if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
|
||||||
(startIndex == 0 || Character.isWhitespace(normalizedInputString.charAt(startIndex - 1)) || isSeparator(normalizedInputString.charAt(startIndex - 1))) &&
|
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
|
||||||
(stopIndex == normalizedInputString.length() || isSeparator(normalizedInputString.charAt(stopIndex)))) {
|
} else {
|
||||||
found.add(new Entity(normalizedInputString.substring(startIndex, stopIndex), entry.getKey(), startIndex, stopIndex, headline));
|
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
|
||||||
}
|
|
||||||
} while (startIndex > -1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -124,19 +126,45 @@ public class EntityRedactionService {
|
|||||||
return found;
|
return found;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
|
||||||
|
|
||||||
|
Set<Entity> found = new HashSet<>();
|
||||||
|
for (String value : values) {
|
||||||
|
int startIndex;
|
||||||
|
int stopIndex = 0;
|
||||||
|
do {
|
||||||
|
startIndex = inputString.indexOf(value, stopIndex);
|
||||||
|
stopIndex = startIndex + value.length();
|
||||||
|
|
||||||
|
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||||
|
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||||
|
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
|
||||||
|
}
|
||||||
|
} while (startIndex > -1);
|
||||||
|
}
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean isSeparator(char c) {
|
private boolean isSeparator(char c) {
|
||||||
|
|
||||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||||
|
|
||||||
List<Entity> wordsToRemove = new ArrayList<>();
|
List<Entity> wordsToRemove = new ArrayList<>();
|
||||||
for (Entity word : entities) {
|
for (Entity word : entities) {
|
||||||
for (Entity inner : entities) {
|
for (Entity inner : entities) {
|
||||||
if (inner.getWord().length() < word.getWord().length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
|
if (inner.getWord().length() < word.getWord()
|
||||||
|
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
|
||||||
wordsToRemove.add(inner);
|
wordsToRemove.add(inner);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
entities.removeAll(wordsToRemove);
|
entities.removeAll(wordsToRemove);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -71,6 +71,7 @@ public class RedactionIntegrationTest {
|
|||||||
private final Map<String, List<String>> dictionary = new HashMap<>();
|
private final Map<String, List<String>> dictionary = new HashMap<>();
|
||||||
private final Map<String, float[]> typeColorMap = new HashMap<>();
|
private final Map<String, float[]> typeColorMap = new HashMap<>();
|
||||||
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
private final Map<String, Boolean> hintTypeMap = new HashMap<>();
|
||||||
|
private final Map<String, Boolean> caseInSensitiveMap = new HashMap<>();
|
||||||
|
|
||||||
@TestConfiguration
|
@TestConfiguration
|
||||||
public static class RedactionIntegrationTestConfiguration {
|
public static class RedactionIntegrationTestConfiguration {
|
||||||
@ -82,7 +83,8 @@ public class RedactionIntegrationTest {
|
|||||||
|
|
||||||
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
|
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
|
||||||
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
|
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
|
||||||
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
|
kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources()
|
||||||
|
.newInputStreamResource(input));
|
||||||
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
|
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
|
||||||
kieBuilder.buildAll();
|
kieBuilder.buildAll();
|
||||||
KieModule kieModule = kieBuilder.getKieModule();
|
KieModule kieModule = kieBuilder.getKieModule();
|
||||||
@ -156,6 +158,12 @@ public class RedactionIntegrationTest {
|
|||||||
hintTypeMap.put(NAME_CODE, false);
|
hintTypeMap.put(NAME_CODE, false);
|
||||||
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
|
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
|
||||||
hintTypeMap.put(DEFAULT, true);
|
hintTypeMap.put(DEFAULT, true);
|
||||||
|
|
||||||
|
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
|
||||||
|
caseInSensitiveMap.put(ADDRESS_CODE, false);
|
||||||
|
caseInSensitiveMap.put(NAME_CODE, false);
|
||||||
|
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
|
||||||
|
caseInSensitiveMap.put(DEFAULT, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -166,14 +174,22 @@ public class RedactionIntegrationTest {
|
|||||||
.map(typeColor -> TypeResult.builder()
|
.map(typeColor -> TypeResult.builder()
|
||||||
.type(typeColor.getKey())
|
.type(typeColor.getKey())
|
||||||
.color(typeColor.getValue())
|
.color(typeColor.getValue())
|
||||||
.isHint(hintTypeMap.get(typeColor.getKey())).build())
|
.isHint(hintTypeMap.get(typeColor.getKey()))
|
||||||
|
.isCaseInsensitive(caseInSensitiveMap.get(typeColor.getKey()))
|
||||||
|
.build())
|
||||||
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private DictionaryResponse getDictionaryResponse(String type) {
|
private DictionaryResponse getDictionaryResponse(String type) {
|
||||||
|
|
||||||
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).isHint(hintTypeMap.get(type)).build();
|
return DictionaryResponse.builder()
|
||||||
|
.color(typeColorMap.get(type))
|
||||||
|
.entries(dictionary.get(type))
|
||||||
|
.isHint(hintTypeMap.get(type))
|
||||||
|
.isCaseInsensitive(caseInSensitiveMap.get(type))
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1 @@
|
|||||||
In Vitro
|
In Vitro
|
||||||
In vitro
|
|
||||||
in vitro
|
|
||||||
@ -100,15 +100,11 @@ Pseudacris triseriata
|
|||||||
poecilia reticulata
|
poecilia reticulata
|
||||||
poultry
|
poultry
|
||||||
quail
|
quail
|
||||||
rabbit
|
|
||||||
rabbits
|
|
||||||
rainbow trout
|
rainbow trout
|
||||||
Rana limnocharis
|
Rana limnocharis
|
||||||
rana
|
rana
|
||||||
limnocharis
|
limnocharis
|
||||||
rana pipiens
|
rana pipiens
|
||||||
rat
|
|
||||||
rats
|
|
||||||
reptile
|
reptile
|
||||||
reptiles
|
reptiles
|
||||||
ricefish
|
ricefish
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user