Merge branch 'feature/RED-10290' into 'master'
RED-10290: Improve SearchImplementation logic for dictionaries. Closes RED-10290. See merge request redactmanager/redaction-service!553.
This commit is contained in:
commit
41f824297c
@ -61,7 +61,9 @@ dependencies {
|
||||
|
||||
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||
implementation("org.ahocorasick:ahocorasick:0.6.3")
|
||||
implementation("org.ahocorasick:ahocorasick:0.9.0")
|
||||
implementation("com.hankcs:aho-corasick-double-array-trie:1.2.2")
|
||||
implementation("com.github.roklenarcic:aho-corasick:1.2")
|
||||
implementation("org.javassist:javassist:3.29.2-GA")
|
||||
|
||||
implementation("org.drools:drools-engine:${droolsVersion}")
|
||||
|
||||
@ -28,6 +28,8 @@ public class RedactionServiceSettings {
|
||||
|
||||
private boolean priorityMode;
|
||||
|
||||
private long firstLevelDictionaryCacheMaximumSize = 1000;
|
||||
|
||||
private long dictionaryCacheMaximumSize = 100;
|
||||
|
||||
private int dictionaryCacheExpireAfterAccessDays = 3;
|
||||
|
||||
@ -0,0 +1,130 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||
|
||||
public abstract class AbstractDictionarySearch implements DictionarySearch {
|
||||
|
||||
protected final Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap;
|
||||
|
||||
|
||||
public AbstractDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
|
||||
|
||||
this.keyWordToIdentifiersMap = keyWordToIdentifiersMap;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchTextRange> getBoundaries(CharSequence text) {
|
||||
|
||||
TextContext textContext = new TextContext(text);
|
||||
return getMatchTextRangeStream(textContext);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region) {
|
||||
|
||||
CharSequence subText = text.subSequence(region.start(), region.end());
|
||||
TextContext textContext = new TextContext(subText, region.start());
|
||||
return getMatchTextRangeStream(textContext);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchTextRange> getBoundaries(TextBlock textBlock) {
|
||||
|
||||
return getBoundaries(textBlock, textBlock.getTextRange());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchPosition> getMatches(String text) {
|
||||
|
||||
TextContext textContext = new TextContext(text);
|
||||
List<MatchPosition> matches = new ArrayList<>();
|
||||
|
||||
parseText(textContext.getLowerText(), (begin, end, value) -> addMatchPositionsForHit(textContext, matches, new Hit(begin, end, value)));
|
||||
|
||||
return matches.stream();
|
||||
}
|
||||
|
||||
|
||||
private Stream<MatchTextRange> getMatchTextRangeStream(TextContext textContext) {
|
||||
|
||||
List<MatchTextRange> matches = new ArrayList<>();
|
||||
|
||||
parseText(textContext.getLowerText(), (begin, end, value) -> addMatchesForHit(textContext, matches, new Hit(begin, end, value)));
|
||||
|
||||
return matches.stream();
|
||||
}
|
||||
|
||||
|
||||
protected abstract void parseText(CharSequence text, HitHandler handler);
|
||||
|
||||
|
||||
protected void addMatchesForHit(TextContext textContext, List<MatchTextRange> matches, Hit hit) {
|
||||
|
||||
int start = textContext.getStart(hit.begin);
|
||||
int end = textContext.getEnd(hit.end);
|
||||
String matchedText = textContext.getMatchedText(hit.begin, hit.end);
|
||||
List<DictionaryIdentifierWithKeyword> idWithKeywords = hit.value;
|
||||
|
||||
for (DictionaryIdentifierWithKeyword idkw : idWithKeywords) {
|
||||
if (idkw.identifier().caseSensitive()) {
|
||||
if (matchedText.equals(idkw.keyword())) {
|
||||
matches.add(new MatchTextRange(idkw.identifier(), new TextRange(start, end)));
|
||||
}
|
||||
} else {
|
||||
matches.add(new MatchTextRange(idkw.identifier(), new TextRange(start, end)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected void addMatchPositionsForHit(TextContext textContext, List<MatchPosition> matches, Hit hit) {
|
||||
|
||||
int start = textContext.getStart(hit.begin);
|
||||
int end = textContext.getEnd(hit.end);
|
||||
String matchedText = textContext.getMatchedText(hit.begin, hit.end);
|
||||
List<DictionaryIdentifierWithKeyword> idWithKeywords = hit.value;
|
||||
|
||||
for (DictionaryIdentifierWithKeyword idkw : idWithKeywords) {
|
||||
MatchPosition matchPosition = new MatchPosition(idkw.identifier(), start, end);
|
||||
if (idkw.identifier().caseSensitive()) {
|
||||
if (matchedText.equals(idkw.keyword())) {
|
||||
matches.add(matchPosition);
|
||||
}
|
||||
} else {
|
||||
matches.add(matchPosition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected interface HitHandler {
|
||||
|
||||
void handle(int begin, int end, List<DictionaryIdentifierWithKeyword> value);
|
||||
|
||||
}
|
||||
|
||||
protected static class Hit {
|
||||
|
||||
final int begin;
|
||||
final int end;
|
||||
final List<DictionaryIdentifierWithKeyword> value;
|
||||
|
||||
|
||||
Hit(int begin, int end, List<DictionaryIdentifierWithKeyword> value) {
|
||||
|
||||
this.begin = begin;
|
||||
this.end = end;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,32 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.roklenarcic.util.strings.AhoCorasickMap;
|
||||
import com.roklenarcic.util.strings.MapMatchListener;
|
||||
import com.roklenarcic.util.strings.StringMap;
|
||||
|
||||
public class AhoCorasickMapDictionarySearch extends AbstractDictionarySearch {
|
||||
|
||||
private final StringMap<List<DictionaryIdentifierWithKeyword>> map;
|
||||
|
||||
|
||||
public AhoCorasickMapDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
|
||||
|
||||
super(keyWordToIdentifiersMap);
|
||||
map = new AhoCorasickMap<>(keyWordToIdentifiersMap.keySet(), keyWordToIdentifiersMap.values(), false);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void parseText(CharSequence text, HitHandler handler) {
|
||||
|
||||
MapMatchListener<List<DictionaryIdentifierWithKeyword>> listener = (haystack, startPosition, endPosition, value) -> {
|
||||
handler.handle(startPosition, endPosition, value);
|
||||
return true;
|
||||
};
|
||||
map.match(text.toString(), listener);
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,19 +2,14 @@ package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.utils.Patterns;
|
||||
@ -29,31 +24,70 @@ import lombok.Getter;
|
||||
@Data
|
||||
public class Dictionary {
|
||||
|
||||
@Getter
|
||||
private List<DictionaryModel> dictionaryModels;
|
||||
// todo: dossier and dossier template level DictionaryModels override each other
|
||||
// at the moment there are no problems because they always have the same rank / hint information
|
||||
// but it should be changed so that the localAccessMap contains all models
|
||||
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
|
||||
private final Map<String, Map<Level, DictionaryModel>> localAccessMap = new HashMap<>();
|
||||
|
||||
@Getter
|
||||
private DictionaryVersion version;
|
||||
private final DictionaryVersion version;
|
||||
|
||||
private final DictionarySearch dictionarySearch;
|
||||
|
||||
public enum Level {
|
||||
DOSSIER_TEMPLATE,
|
||||
DOSSIER
|
||||
}
|
||||
|
||||
|
||||
public Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version) {
|
||||
Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version, DictionarySearch dictionarySearch) {
|
||||
|
||||
this.dictionaryModels = dictionaryModels;
|
||||
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
|
||||
dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), Map.of(getLevel(dm.isDossierDictionary()), dm)));
|
||||
this.version = version;
|
||||
this.dictionarySearch = dictionarySearch;
|
||||
}
|
||||
|
||||
|
||||
private Level getLevel(boolean isDossierDictionary) {
|
||||
|
||||
return isDossierDictionary ? Level.DOSSIER : Level.DOSSIER_TEMPLATE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines the default level for a given type based on the levels present.
|
||||
* If both levels are present, it defaults to {@code Level.DOSSIER}.
|
||||
*
|
||||
* @param type The type to determine the default level for.
|
||||
* @return The default {@link Level} for the specified type.
|
||||
* @throws NotFoundException If the type is not found in the dictionary.
|
||||
*/
|
||||
private Level getDefaultLevel(String type) {
|
||||
|
||||
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
|
||||
if (levelMap == null || levelMap.isEmpty()) {
|
||||
throw new NotFoundException("Type: " + type + " is not found");
|
||||
}
|
||||
if (levelMap.containsKey(Level.DOSSIER)) {
|
||||
return Level.DOSSIER;
|
||||
} else {
|
||||
// Use whatever level is present
|
||||
return levelMap.keySet()
|
||||
.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public int getDictionaryRank(String type, Level level) {
|
||||
|
||||
if (!localAccessMap.containsKey(type)) {
|
||||
return 0;
|
||||
}
|
||||
DictionaryModel model = localAccessMap.get(type)
|
||||
.get(level);
|
||||
return model != null ? model.getRank() : 0;
|
||||
}
|
||||
|
||||
|
||||
public int getDictionaryRank(String type) {
|
||||
|
||||
if (!localAccessMap.containsKey(type)) {
|
||||
return 0;
|
||||
}
|
||||
return localAccessMap.get(type).getRank();
|
||||
return getDictionaryRank(type, getDefaultLevel(type));
|
||||
}
|
||||
|
||||
|
||||
@ -64,11 +98,21 @@ public class Dictionary {
|
||||
*/
|
||||
public boolean hasLocalEntries() {
|
||||
|
||||
return dictionaryModels.stream()
|
||||
return getDictionaryModels().stream()
|
||||
.anyMatch(dm -> !dm.getLocalEntriesWithMatchedRules().isEmpty());
|
||||
}
|
||||
|
||||
|
||||
public List<DictionaryModel> getDictionaryModels() {
|
||||
|
||||
return localAccessMap.values()
|
||||
.stream()
|
||||
.flatMap(levelDictionaryModelMap -> levelDictionaryModelMap.values()
|
||||
.stream())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public Set<String> getTypes() {
|
||||
|
||||
return localAccessMap.keySet();
|
||||
@ -76,56 +120,144 @@ public class Dictionary {
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the {@link DictionaryModel} of a specified type.
|
||||
* Retrieves the {@link DictionaryModel} of a specified type and level.
|
||||
*
|
||||
* @param type The type of dictionary model to retrieve.
|
||||
* @return The {@link DictionaryModel} of the specified type.
|
||||
* @throws NotFoundException If the specified type is not found in the dictionary.
|
||||
* @param type The type of dictionary model to retrieve.
|
||||
* @param level The level of the dictionary model to retrieve.
|
||||
* @return The {@link DictionaryModel} of the specified type and level.
|
||||
* @throws NotFoundException If the specified type or level is not found in the dictionary.
|
||||
*/
|
||||
public DictionaryModel getType(String type) {
|
||||
public DictionaryModel getType(String type, Level level) {
|
||||
|
||||
DictionaryModel model = localAccessMap.get(type);
|
||||
if (model == null) {
|
||||
throw new NotFoundException("Type: " + type + " is not found");
|
||||
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
|
||||
if (levelMap == null || !levelMap.containsKey(level)) {
|
||||
throw new NotFoundException("Type: " + type + " with level: " + level + " is not found");
|
||||
}
|
||||
return model;
|
||||
return levelMap.get(level);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the dictionary of a specific type is considered a hint.
|
||||
* Retrieves the {@link DictionaryModel} of a specified type at the default level.
|
||||
*
|
||||
* @param type The type of dictionary model to retrieve.
|
||||
* @return The {@link DictionaryModel} of the specified type at the default level.
|
||||
* @throws NotFoundException If the specified type is not found in the dictionary.
|
||||
*/
|
||||
public DictionaryModel getType(String type) {
|
||||
|
||||
return getType(type, getDefaultLevel(type));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the dictionary of a specific type and level is considered a hint.
|
||||
*
|
||||
* @param type The type of dictionary to check.
|
||||
* @param level The level of the dictionary to check.
|
||||
* @return true if the dictionary model is marked as a hint, false otherwise.
|
||||
*/
|
||||
public boolean isHint(String type, Level level) {
|
||||
|
||||
DictionaryModel model = localAccessMap.get(type)
|
||||
.get(level);
|
||||
return model != null && model.isHint();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the dictionary of a specific type is considered a hint at the default level.
|
||||
*
|
||||
* @param type The type of dictionary to check.
|
||||
* @return true if the dictionary model is marked as a hint, false otherwise.
|
||||
*/
|
||||
public boolean isHint(String type) {
|
||||
|
||||
DictionaryModel model = localAccessMap.get(type);
|
||||
if (model != null) {
|
||||
return model.isHint();
|
||||
}
|
||||
return false;
|
||||
return isHint(type, getDefaultLevel(type));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the dictionary of a specific type is case-insensitive.
|
||||
* Checks if the dictionary of a specific type and level is case-insensitive.
|
||||
*
|
||||
* @param type The type of dictionary to check.
|
||||
* @param level The level of the dictionary to check.
|
||||
* @return true if the dictionary is case-insensitive, false otherwise.
|
||||
*/
|
||||
public boolean isCaseInsensitiveDictionary(String type, Level level) {
|
||||
|
||||
DictionaryModel dictionaryModel = localAccessMap.get(type)
|
||||
.get(level);
|
||||
return dictionaryModel != null && dictionaryModel.isCaseInsensitive();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the dictionary of a specific type is case-insensitive at the default level.
|
||||
*
|
||||
* @param type The type of dictionary to check.
|
||||
* @return true if the dictionary is case-insensitive, false otherwise.
|
||||
*/
|
||||
public boolean isCaseInsensitiveDictionary(String type) {
|
||||
|
||||
DictionaryModel dictionaryModel = localAccessMap.get(type);
|
||||
if (dictionaryModel != null) {
|
||||
return dictionaryModel.isCaseInsensitive();
|
||||
}
|
||||
return false;
|
||||
return isCaseInsensitiveDictionary(type, getDefaultLevel(type));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds a local dictionary entry of a specific type.
|
||||
* Adds a local dictionary entry of a specific type and level.
|
||||
*
|
||||
* @param type The type of dictionary to add the entry to.
|
||||
* @param value The value of the entry.
|
||||
* @param matchedRules A collection of {@link MatchedRule} associated with the entry.
|
||||
* @param alsoAddLastname Indicates whether to also add the lastname separately as an entry.
|
||||
* @param level The level of the dictionary where the entry should be added.
|
||||
* @throws IllegalArgumentException If the specified type does not exist within the dictionary, if the type
|
||||
* does not have any local entries defined, or if the provided value is
|
||||
* blank. This ensures that only valid, non-empty entries
|
||||
* are added to the dictionary.
|
||||
*/
|
||||
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname, Level level) {
|
||||
|
||||
if (value.isBlank()) {
|
||||
return;
|
||||
}
|
||||
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
|
||||
if (levelMap == null || !levelMap.containsKey(level)) {
|
||||
throw new IllegalArgumentException(format("DictionaryModel of type %s with level %s does not exist", type, level));
|
||||
}
|
||||
DictionaryModel dictionaryModel = levelMap.get(level);
|
||||
if (dictionaryModel.getLocalEntriesWithMatchedRules() == null) {
|
||||
throw new IllegalArgumentException(format("DictionaryModel of type %s has no local Entries", type));
|
||||
}
|
||||
if (StringUtils.isEmpty(value)) {
|
||||
throw new IllegalArgumentException(format("%s is not a valid dictionary entry", value));
|
||||
}
|
||||
boolean isCaseInsensitive = dictionaryModel.isCaseInsensitive();
|
||||
Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
|
||||
|
||||
String cleanedValue = value;
|
||||
if (isCaseInsensitive) {
|
||||
cleanedValue = cleanedValue.toLowerCase(Locale.US);
|
||||
}
|
||||
dictionaryModel.getLocalEntriesWithMatchedRules()
|
||||
.merge(cleanedValue.trim(),
|
||||
matchedRulesSet,
|
||||
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
|
||||
.collect(Collectors.toSet()));
|
||||
if (alsoAddLastname) {
|
||||
String lastname = cleanedValue.split(" ")[0];
|
||||
dictionaryModel.getLocalEntriesWithMatchedRules()
|
||||
.merge(lastname,
|
||||
matchedRulesSet,
|
||||
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds a local dictionary entry of a specific type at the default level.
|
||||
*
|
||||
* @param type The type of dictionary to add the entry to.
|
||||
* @param value The value of the entry.
|
||||
@ -138,40 +270,7 @@ public class Dictionary {
|
||||
*/
|
||||
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname) {
|
||||
|
||||
if (value.isBlank()) {
|
||||
return;
|
||||
}
|
||||
if (localAccessMap.get(type) == null) {
|
||||
throw new IllegalArgumentException(format("DictionaryModel of type %s does not exist", type));
|
||||
}
|
||||
if (localAccessMap.get(type).getLocalEntriesWithMatchedRules() == null) {
|
||||
throw new IllegalArgumentException(format("DictionaryModel of type %s has no local Entries", type));
|
||||
}
|
||||
if (StringUtils.isEmpty(value)) {
|
||||
throw new IllegalArgumentException(format("%s is not a valid dictionary entry", value));
|
||||
}
|
||||
boolean isCaseInsensitive = localAccessMap.get(type).isCaseInsensitive();
|
||||
Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
|
||||
|
||||
String cleanedValue = value;
|
||||
if (isCaseInsensitive) {
|
||||
cleanedValue = cleanedValue.toLowerCase(Locale.US);
|
||||
}
|
||||
localAccessMap.get(type)
|
||||
.getLocalEntriesWithMatchedRules()
|
||||
.merge(cleanedValue.trim(),
|
||||
matchedRulesSet,
|
||||
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
|
||||
.collect(Collectors.toSet()));
|
||||
if (alsoAddLastname) {
|
||||
String lastname = cleanedValue.split(" ")[0];
|
||||
localAccessMap.get(type)
|
||||
.getLocalEntriesWithMatchedRules()
|
||||
.merge(lastname,
|
||||
matchedRulesSet,
|
||||
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
addLocalDictionaryEntry(type, value, matchedRules, alsoAddLastname, getDefaultLevel(type));
|
||||
}
|
||||
|
||||
|
||||
@ -179,10 +278,22 @@ public class Dictionary {
|
||||
* Recommends a text entity for inclusion in every dictionary model without separating the last name.
|
||||
*
|
||||
* @param textEntity The {@link TextEntity} to be recommended.
|
||||
* @param level The level of the dictionary where the recommendation should be added.
|
||||
*/
|
||||
public void recommendEverywhere(TextEntity textEntity, Level level) {
|
||||
|
||||
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), false, level);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Recommends a text entity for inclusion in every dictionary model without separating the last name at the default level.
|
||||
*
|
||||
* @param textEntity The {@link TextEntity} to be recommended.
|
||||
*/
|
||||
public void recommendEverywhere(TextEntity textEntity) {
|
||||
|
||||
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), false);
|
||||
recommendEverywhere(textEntity, getDefaultLevel(textEntity.type()));
|
||||
}
|
||||
|
||||
|
||||
@ -190,10 +301,22 @@ public class Dictionary {
|
||||
* Recommends a text entity for inclusion in every dictionary model with the last name added separately.
|
||||
*
|
||||
* @param textEntity The {@link TextEntity} to be recommended.
|
||||
* @param level The level of the dictionary where the recommendation should be added.
|
||||
*/
|
||||
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity, Level level) {
|
||||
|
||||
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), true, level);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Recommends a text entity for inclusion in every dictionary model with the last name added separately at the default level.
|
||||
*
|
||||
* @param textEntity The {@link TextEntity} to be recommended.
|
||||
*/
|
||||
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity) {
|
||||
|
||||
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), true);
|
||||
recommendEverywhereWithLastNameSeparately(textEntity, getDefaultLevel(textEntity.type()));
|
||||
}
|
||||
|
||||
|
||||
@ -201,11 +324,22 @@ public class Dictionary {
|
||||
* Adds multiple author names contained within a text entity as recommendations in the dictionary.
|
||||
*
|
||||
* @param textEntity The {@link TextEntity} containing author names to be added.
|
||||
* @param level The level of the dictionary where the recommendations should be added.
|
||||
*/
|
||||
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity, Level level) {
|
||||
|
||||
splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.type(), authorName, textEntity.getMatchedRuleList(), true, level));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds multiple author names contained within a text entity as recommendations in the dictionary at the default level.
|
||||
*
|
||||
* @param textEntity The {@link TextEntity} containing author names to be added.
|
||||
*/
|
||||
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity) {
|
||||
|
||||
splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.type(), authorName, textEntity.getMatchedRuleList(), true));
|
||||
|
||||
addMultipleAuthorsAsRecommendation(textEntity, getDefaultLevel(textEntity.type()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,90 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DictionaryFactory {
|
||||
|
||||
@SneakyThrows
|
||||
public Dictionary create(List<DictionaryModel> dictionaryModels, DictionaryVersion dictionaryVersion) {
|
||||
|
||||
Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap = computeStringIdentifiersMap(dictionaryModels);
|
||||
DictionarySearch dictionarySearch = getDictionarySearch(keyWordToIdentifiersMap);
|
||||
|
||||
return new Dictionary(dictionaryModels, dictionaryVersion, dictionarySearch);
|
||||
}
|
||||
|
||||
|
||||
private static DictionarySearch getDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
|
||||
|
||||
// a more sophisticated selection of the dictionarySearch could be done here
|
||||
// but as we do not have the need to fine-tune at the moment we use the all-rounder solution, which is the AhoCoraSickMapDictionarySearch
|
||||
// based on this repository https://github.com/RokLenarcic/AhoCorasick
|
||||
|
||||
// This is an outline how a more complex dictionarySearch decision could be made:
|
||||
// if (!redactionServiceSettings.isPriorityMode() && keyWordToIdentifiersMap.keySet().size() < 50_000) {
|
||||
// dictionarySearch = new DoubleArrayTrieDictionarySearch(keyWordToIdentifiersMap);
|
||||
// } else {
|
||||
// dictionarySearch = new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
|
||||
// }
|
||||
|
||||
return new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
|
||||
}
|
||||
|
||||
|
||||
protected static Map<String, List<DictionaryIdentifierWithKeyword>> computeStringIdentifiersMap(List<DictionaryModel> dictionaryModels) {
|
||||
|
||||
Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap = new HashMap<>();
|
||||
|
||||
for (DictionaryModel model : dictionaryModels) {
|
||||
|
||||
// Add entries for different entity types
|
||||
addEntriesToMap(stringToIdentifiersMap, model, model.isHint() ? EntityType.HINT : EntityType.ENTITY, model.getEntries(), false);
|
||||
addEntriesToMap(stringToIdentifiersMap, model, EntityType.FALSE_POSITIVE, model.getFalsePositives(), false);
|
||||
addEntriesToMap(stringToIdentifiersMap, model, EntityType.FALSE_RECOMMENDATION, model.getFalseRecommendations(), false);
|
||||
|
||||
if (model.isDossierDictionary()) {
|
||||
addEntriesToMap(stringToIdentifiersMap, model, EntityType.DICTIONARY_REMOVAL, model.getEntries(), true);
|
||||
}
|
||||
}
|
||||
|
||||
return stringToIdentifiersMap;
|
||||
}
|
||||
|
||||
|
||||
private static void addEntriesToMap(Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap,
|
||||
DictionaryModel model,
|
||||
EntityType entityType,
|
||||
Set<DictionaryEntryModel> entries,
|
||||
boolean isDeleted) {
|
||||
|
||||
DictionaryIdentifier identifier = new DictionaryIdentifier(model.getType(), entityType, model.isDossierDictionary(), !model.isCaseInsensitive());
|
||||
|
||||
List<String> values = entries.stream()
|
||||
.filter(entry -> entry.isDeleted() == isDeleted)
|
||||
.map(DictionaryEntry::getValue)
|
||||
.toList();
|
||||
|
||||
for (String value : values) {
|
||||
DictionaryIdentifierWithKeyword idWithKeyword = new DictionaryIdentifierWithKeyword(identifier, value);
|
||||
String key = value.toLowerCase(Locale.ROOT);
|
||||
stringToIdentifiersMap.computeIfAbsent(key, k -> new ArrayList<>()).add(idWithKeyword);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
||||
|
||||
public record DictionaryIdentifier(String type, EntityType entityType, boolean dossierDictionaryEntry, boolean caseSensitive) {
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,51 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
import org.ahocorasick.trie.PayloadTrie;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
public final class DictionaryIdentifierTrie {
|
||||
private final PayloadTrie<DictionaryIdentifier> trie;
|
||||
|
||||
private DictionaryIdentifierTrie(PayloadTrie<DictionaryIdentifier> trie) {
|
||||
this.trie = trie;
|
||||
}
|
||||
|
||||
public static class DictionaryIdentifierTrieBuilder {
|
||||
private final PayloadTrie.PayloadTrieBuilder<DictionaryIdentifier> builder;
|
||||
|
||||
public DictionaryIdentifierTrieBuilder() {
|
||||
this.builder = PayloadTrie.builder();
|
||||
}
|
||||
|
||||
public DictionaryIdentifierTrieBuilder ignoreCase() {
|
||||
builder.ignoreCase();
|
||||
return this;
|
||||
}
|
||||
|
||||
public DictionaryIdentifierTrieBuilder addKeyword(String keyword, DictionaryIdentifier payload) {
|
||||
builder.addKeyword(keyword, payload);
|
||||
return this;
|
||||
}
|
||||
|
||||
public DictionaryIdentifierTrieBuilder addKeywords(Collection<String> keywords, DictionaryIdentifier payload) {
|
||||
for (String keyword : keywords) {
|
||||
builder.addKeyword(keyword, payload);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public DictionaryIdentifierTrie build() {
|
||||
return new DictionaryIdentifierTrie(builder.build());
|
||||
}
|
||||
}
|
||||
|
||||
public Collection<PayloadEmit<DictionaryIdentifier>> parseText(CharSequence text) {
|
||||
return trie.parseText(text);
|
||||
}
|
||||
|
||||
public boolean containsMatch(CharSequence text) {
|
||||
return trie.containsMatch(text);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
public record DictionaryIdentifierWithKeyword(DictionaryIdentifier identifier, String keyword) {
|
||||
|
||||
}
|
||||
@ -1,13 +1,12 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.type.Type;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
|
||||
|
||||
import lombok.Data;
|
||||
@ -21,7 +20,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
*/
|
||||
@Data
|
||||
@Slf4j
|
||||
public class DictionaryModel implements Serializable {
|
||||
public class DictionaryModel implements Cloneable {
|
||||
|
||||
private final String type;
|
||||
private final int rank;
|
||||
@ -33,13 +32,8 @@ public class DictionaryModel implements Serializable {
|
||||
private final Set<DictionaryEntryModel> falsePositives;
|
||||
private final Set<DictionaryEntryModel> falseRecommendations;
|
||||
|
||||
private transient SearchImplementation entriesSearch;
|
||||
private transient SearchImplementation deletionEntriesSearch;
|
||||
private transient SearchImplementation falsePositiveSearch;
|
||||
private transient SearchImplementation falseRecommendationsSearch;
|
||||
|
||||
private final HashMap<String, Set<MatchedRule>> localEntriesWithMatchedRules = new HashMap<>();
|
||||
private transient SearchImplementation localSearch;
|
||||
private SearchImplementation localSearch;
|
||||
|
||||
|
||||
/**
|
||||
@ -91,74 +85,6 @@ public class DictionaryModel implements Serializable {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the search implementation for non-deleted dictionary entries.
|
||||
*
|
||||
* @return The {@link SearchImplementation} for non-deleted dictionary entries.
|
||||
*/
|
||||
public SearchImplementation getEntriesSearch() {
|
||||
|
||||
if (entriesSearch == null) {
|
||||
this.entriesSearch = new SearchImplementation(this.entries.stream()
|
||||
.filter(e -> !e.isDeleted())
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return entriesSearch;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the search implementation for deleted dictionary entries.
|
||||
*
|
||||
* @return The {@link SearchImplementation} for deleted dictionary entries.
|
||||
*/
|
||||
public SearchImplementation getDeletionEntriesSearch() {
|
||||
|
||||
if (deletionEntriesSearch == null) {
|
||||
this.deletionEntriesSearch = new SearchImplementation(this.entries.stream()
|
||||
.filter(DictionaryEntry::isDeleted)
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return deletionEntriesSearch;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the search implementation for non-deleted false positive entries.
|
||||
*
|
||||
* @return The {@link SearchImplementation} for non-deleted false positive entries.
|
||||
*/
|
||||
public SearchImplementation getFalsePositiveSearch() {
|
||||
|
||||
if (falsePositiveSearch == null) {
|
||||
this.falsePositiveSearch = new SearchImplementation(this.falsePositives.stream()
|
||||
.filter(e -> !e.isDeleted())
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return falsePositiveSearch;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the search implementation for non-deleted false recommendation entries.
|
||||
*
|
||||
* @return The {@link SearchImplementation} for non-deleted false recommendation entries.
|
||||
*/
|
||||
public SearchImplementation getFalseRecommendationsSearch() {
|
||||
|
||||
if (falseRecommendationsSearch == null) {
|
||||
this.falseRecommendationsSearch = new SearchImplementation(this.falseRecommendations.stream()
|
||||
.filter(e -> !e.isDeleted())
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toList()), caseInsensitive);
|
||||
}
|
||||
return falseRecommendationsSearch;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the matched rules for a given value from the local dictionary entries.
|
||||
* The value is processed based on the case sensitivity of the dictionary.
|
||||
@ -172,4 +98,149 @@ public class DictionaryModel implements Serializable {
|
||||
return localEntriesWithMatchedRules.get(cleanedValue);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DictionaryModel clone() {
|
||||
|
||||
try {
|
||||
DictionaryModel cloned = (DictionaryModel) super.clone();
|
||||
|
||||
cloned.localSearch = null;
|
||||
|
||||
return cloned;
|
||||
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new AssertionError("Cloning not supported", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void addNewEntries(long versionThreshold, Set<DictionaryIncrementValue> newValues) {
|
||||
|
||||
getEntries().forEach(entry -> {
|
||||
if (entry.getVersion() > versionThreshold) {
|
||||
newValues.add(new DictionaryIncrementValue(entry.getValue(), isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
getFalsePositives().forEach(entry -> {
|
||||
if (entry.getVersion() > versionThreshold) {
|
||||
newValues.add(new DictionaryIncrementValue(entry.getValue(), isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
getFalseRecommendations().forEach(entry -> {
|
||||
if (entry.getVersion() > versionThreshold) {
|
||||
newValues.add(new DictionaryIncrementValue(entry.getValue(), isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public void handleOldEntries(Type newType,
|
||||
DictionaryEntries newEntries,
|
||||
Set<DictionaryEntryModel> combinedEntries,
|
||||
Set<DictionaryEntryModel> combinedFalsePositives,
|
||||
Set<DictionaryEntryModel> combinedFalseRecommendations) {
|
||||
|
||||
if (isCaseInsensitive() && !newType.isCaseInsensitive()) {
|
||||
// Compute new entries' values in lowercase once
|
||||
Set<String> newEntryValuesLower = newEntries.getEntries()
|
||||
.stream()
|
||||
.map(s -> s.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedEntries.addAll(getEntries()
|
||||
.stream()
|
||||
.filter(f -> !newEntryValuesLower.contains(f.getValue()))
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
// Similarly for false positives
|
||||
Set<String> newFalsePositivesValuesLower = newEntries.getFalsePositives()
|
||||
.stream()
|
||||
.map(s -> s.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedFalsePositives.addAll(getFalsePositives()
|
||||
.stream()
|
||||
.filter(f -> !newFalsePositivesValuesLower.contains(f.getValue()))
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
// Similarly for false recommendations
|
||||
Set<String> newFalseRecommendationsValuesLower = newEntries.getFalseRecommendations()
|
||||
.stream()
|
||||
.map(s -> s.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedFalseRecommendations.addAll(getFalseRecommendations()
|
||||
.stream()
|
||||
.filter(f -> !newFalseRecommendationsValuesLower.contains(f.getValue()))
|
||||
.collect(Collectors.toSet()));
|
||||
} else if (!isCaseInsensitive() && newType.isCaseInsensitive()) {
|
||||
// Compute new entries' values in lowercase once
|
||||
Set<String> newEntryValuesLower = newEntries.getEntries()
|
||||
.stream()
|
||||
.map(s -> s.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedEntries.addAll(getEntries()
|
||||
.stream()
|
||||
.filter(f -> !newEntryValuesLower.contains(f.getValue().toLowerCase(Locale.ROOT)))
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
// Similarly for false positives
|
||||
Set<String> newFalsePositivesValuesLower = newEntries.getFalsePositives()
|
||||
.stream()
|
||||
.map(s -> s.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedFalsePositives.addAll(getFalsePositives()
|
||||
.stream()
|
||||
.filter(f -> !newFalsePositivesValuesLower.contains(f.getValue().toLowerCase(Locale.ROOT)))
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
// Similarly for false recommendations
|
||||
Set<String> newFalseRecommendationsValuesLower = newEntries.getFalseRecommendations()
|
||||
.stream()
|
||||
.map(s -> s.getValue().toLowerCase(Locale.ROOT))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedFalseRecommendations.addAll(getFalseRecommendations()
|
||||
.stream()
|
||||
.filter(f -> !newFalseRecommendationsValuesLower.contains(f.getValue().toLowerCase(Locale.ROOT)))
|
||||
.collect(Collectors.toSet()));
|
||||
} else {
|
||||
// Both have the same case sensitivity
|
||||
Set<String> newEntryValues = newEntries.getEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedEntries.addAll(getEntries()
|
||||
.stream()
|
||||
.filter(f -> !newEntryValues.contains(f.getValue()))
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
// Similarly for false positives
|
||||
Set<String> newFalsePositivesValues = newEntries.getFalsePositives()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedFalsePositives.addAll(getFalsePositives()
|
||||
.stream()
|
||||
.filter(f -> !newFalsePositivesValues.contains(f.getValue()))
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
// Similarly for false recommendations
|
||||
Set<String> newFalseRecommendationsValues = newEntries.getFalseRecommendations()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
combinedFalseRecommendations.addAll(getFalseRecommendations()
|
||||
.stream()
|
||||
.filter(f -> !newFalseRecommendationsValues.contains(f.getValue()))
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,86 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||
|
||||
/**
|
||||
* Common interface for dictionary search implementations.
|
||||
*/
|
||||
public interface DictionarySearch {
|
||||
|
||||
/**
|
||||
* Retrieves a list of match boundaries within the given text.
|
||||
*
|
||||
* @param text The text to search within.
|
||||
* @return A list of MatchTextRange representing the boundaries of matches.
|
||||
*/
|
||||
default List<MatchTextRange> getBoundariesAsList(CharSequence text) {
|
||||
return getBoundaries(text).toList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a stream of match boundaries within the given text.
|
||||
*
|
||||
* @param text The text to search within.
|
||||
* @return A stream of MatchTextRange representing the boundaries of matches.
|
||||
*/
|
||||
Stream<MatchTextRange> getBoundaries(CharSequence text);
|
||||
|
||||
/**
|
||||
* Retrieves a list of match boundaries within a specified region of the text.
|
||||
*
|
||||
* @param text The text to search within.
|
||||
* @param region The specific region of the text to search.
|
||||
* @return A list of MatchTextRange representing the boundaries of matches.
|
||||
*/
|
||||
default List<MatchTextRange> getBoundariesAsList(CharSequence text, TextRange region) {
|
||||
return getBoundaries(text, region).toList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a stream of match boundaries within a specified region of the text.
|
||||
*
|
||||
* @param text The text to search within.
|
||||
* @param region The specific region of the text to search.
|
||||
* @return A stream of MatchTextRange representing the boundaries of matches.
|
||||
*/
|
||||
Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region);
|
||||
|
||||
/**
|
||||
* Retrieves a stream of match boundaries within the given TextBlock.
|
||||
*
|
||||
* @param textBlock The TextBlock to search within.
|
||||
* @return A stream of MatchTextRange representing the boundaries of matches.
|
||||
*/
|
||||
Stream<MatchTextRange> getBoundaries(TextBlock textBlock);
|
||||
|
||||
/**
|
||||
* Retrieves a list of match positions within the given text.
|
||||
*
|
||||
* @param text The text to search within.
|
||||
* @return A list of MatchPosition representing the positions of matches.
|
||||
*/
|
||||
default List<MatchPosition> getMatchesAsList(String text) {
|
||||
return getMatches(text).toList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a stream of match positions within the given text.
|
||||
*
|
||||
* @param text The text to search within.
|
||||
* @return A stream of MatchPosition representing the positions of matches.
|
||||
*/
|
||||
Stream<MatchPosition> getMatches(String text);
|
||||
|
||||
/**
|
||||
* Record representing the range of matched text along with its identifier.
|
||||
*/
|
||||
record MatchTextRange(DictionaryIdentifier identifier, TextRange textRange) {}
|
||||
|
||||
/**
|
||||
* Record representing the start and end positions of a match along with its identifier.
|
||||
*/
|
||||
record MatchPosition(DictionaryIdentifier identifier, int startIndex, int endIndex) {}
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
|
||||
|
||||
public class DoubleArrayTrieDictionarySearch extends AbstractDictionarySearch {
|
||||
|
||||
private final AhoCorasickDoubleArrayTrie<List<DictionaryIdentifierWithKeyword>> trie;
|
||||
|
||||
|
||||
public DoubleArrayTrieDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
|
||||
|
||||
super(keyWordToIdentifiersMap);
|
||||
trie = new AhoCorasickDoubleArrayTrie<>();
|
||||
trie.build(keyWordToIdentifiersMap);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
protected void parseText(CharSequence text, HitHandler handler) {
|
||||
|
||||
List<AhoCorasickDoubleArrayTrie.Hit<List<DictionaryIdentifierWithKeyword>>> hits = trie.parseText(text);
|
||||
for (AhoCorasickDoubleArrayTrie.Hit<List<DictionaryIdentifierWithKeyword>> hit : hits) {
|
||||
handler.handle(hit.begin, hit.end, hit.value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,138 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||
|
||||
public class DoubleTrieDictionarySearch implements DictionarySearch {
|
||||
|
||||
private final Map<DictionaryIdentifier, List<String>> caseSensitiveEntries = new HashMap<>();
|
||||
private final Map<DictionaryIdentifier, List<String>> caseInsensitiveEntries = new HashMap<>();
|
||||
private final DictionaryIdentifierTrie caseSensitiveTrie;
|
||||
private final DictionaryIdentifierTrie caseInsensitiveTrie;
|
||||
|
||||
|
||||
public DoubleTrieDictionarySearch(Map<DictionaryIdentifier, List<String>> dictionaryValues) {
|
||||
|
||||
for (Map.Entry<DictionaryIdentifier, List<String>> entry : dictionaryValues.entrySet()) {
|
||||
DictionaryIdentifier identifier = entry.getKey();
|
||||
List<String> values = entry.getValue();
|
||||
if (identifier.caseSensitive()) {
|
||||
caseSensitiveEntries.put(identifier, values);
|
||||
} else {
|
||||
caseInsensitiveEntries.put(identifier, values);
|
||||
}
|
||||
}
|
||||
|
||||
this.caseSensitiveTrie = createTrie(caseSensitiveEntries, false);
|
||||
this.caseInsensitiveTrie = createTrie(caseInsensitiveEntries, true);
|
||||
}
|
||||
|
||||
|
||||
private DictionaryIdentifierTrie createTrie(Map<DictionaryIdentifier, List<String>> entries, boolean ignoreCase) {
|
||||
|
||||
if (entries.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
DictionaryIdentifierTrie.DictionaryIdentifierTrieBuilder builder = new DictionaryIdentifierTrie.DictionaryIdentifierTrieBuilder();
|
||||
if (ignoreCase) {
|
||||
builder.ignoreCase();
|
||||
}
|
||||
entries.forEach((identifier, values) -> {
|
||||
for (String value : values) {
|
||||
builder.addKeyword(value, identifier);
|
||||
}
|
||||
});
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
||||
public boolean atLeastOneMatches(String text) {
|
||||
|
||||
if (!caseSensitiveEntries.isEmpty() && caseSensitiveTrie != null && caseSensitiveTrie.containsMatch(text)) {
|
||||
return true;
|
||||
}
|
||||
return !caseInsensitiveEntries.isEmpty() && caseInsensitiveTrie != null && caseInsensitiveTrie.containsMatch(text);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchTextRange> getBoundaries(CharSequence text) {
|
||||
|
||||
List<MatchTextRange> matches = new ArrayList<>();
|
||||
addMatchTextRangesForTrie(caseSensitiveEntries, caseSensitiveTrie, matches, text);
|
||||
addMatchTextRangesForTrie(caseInsensitiveEntries, caseInsensitiveTrie, matches, text);
|
||||
return matches.stream();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchTextRange> getBoundaries(TextBlock textBlock) {
|
||||
|
||||
return getBoundaries(textBlock, textBlock.getTextRange());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region) {
|
||||
|
||||
List<MatchTextRange> matches = new ArrayList<>();
|
||||
addMatchTextRangesForTrie(text, region, matches, caseSensitiveEntries, caseSensitiveTrie);
|
||||
addMatchTextRangesForTrie(text, region, matches, caseInsensitiveEntries, caseInsensitiveTrie);
|
||||
return matches.stream();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Stream<MatchPosition> getMatches(String text) {
|
||||
|
||||
List<MatchPosition> matches = new ArrayList<>();
|
||||
addMatchPositionsForTrie(caseSensitiveEntries, caseSensitiveTrie, matches, text);
|
||||
addMatchPositionsForTrie(caseInsensitiveEntries, caseInsensitiveTrie, matches, text);
|
||||
return matches.stream();
|
||||
}
|
||||
|
||||
|
||||
private void addMatchTextRangesForTrie(Map<DictionaryIdentifier, List<String>> entries, DictionaryIdentifierTrie trie, List<MatchTextRange> matches, CharSequence text) {
|
||||
|
||||
if (!entries.isEmpty() && trie != null) {
|
||||
matches.addAll(trie.parseText(text)
|
||||
.stream()
|
||||
.map(r -> new MatchTextRange(r.getPayload(), new TextRange(r.getStart(), r.getEnd() + 1)))
|
||||
.toList());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addMatchTextRangesForTrie(CharSequence text,
|
||||
TextRange region,
|
||||
List<MatchTextRange> matches,
|
||||
Map<DictionaryIdentifier, List<String>> entries,
|
||||
DictionaryIdentifierTrie trie) {
|
||||
|
||||
if (!entries.isEmpty() && trie != null) {
|
||||
CharSequence subSequence = text.subSequence(region.start(), region.end());
|
||||
matches.addAll(trie.parseText(subSequence)
|
||||
.stream()
|
||||
.map(r -> new MatchTextRange(r.getPayload(), new TextRange(r.getStart() + region.start(), r.getEnd() + region.start() + 1)))
|
||||
.toList());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addMatchPositionsForTrie(Map<DictionaryIdentifier, List<String>> entries, DictionaryIdentifierTrie trie, List<MatchPosition> matches, String text) {
|
||||
|
||||
if (!entries.isEmpty() && trie != null) {
|
||||
matches.addAll(trie.parseText(text)
|
||||
.stream()
|
||||
.map(r -> new MatchPosition(r.getPayload(), r.getStart(), r.getEnd() + 1))
|
||||
.toList());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,46 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.dictionary;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class TextContext {
|
||||
|
||||
private final CharSequence text;
|
||||
@Getter
|
||||
private final String lowerText;
|
||||
private final int offset;
|
||||
|
||||
|
||||
TextContext(CharSequence text, int offset) {
|
||||
|
||||
this.text = text;
|
||||
this.lowerText = text.toString().toLowerCase(Locale.ROOT);
|
||||
this.offset = offset;
|
||||
}
|
||||
|
||||
|
||||
TextContext(CharSequence text) {
|
||||
|
||||
this(text, 0);
|
||||
}
|
||||
|
||||
|
||||
public int getStart(int hitBegin) {
|
||||
|
||||
return hitBegin + offset;
|
||||
}
|
||||
|
||||
|
||||
public int getEnd(int hitEnd) {
|
||||
|
||||
return hitEnd + offset;
|
||||
}
|
||||
|
||||
|
||||
public String getMatchedText(int hitBegin, int hitEnd) {
|
||||
|
||||
return text.subSequence(hitBegin, hitEnd).toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -8,7 +8,6 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ -104,7 +103,7 @@ public class AnalysisPreparationService {
|
||||
|
||||
CompletableFuture.allOf(kieWrapperEntityRulesFuture, kieWrapperComponentRulesFuture, documentFuture, importedRedactionsFuture, nerEntitiesFuture).join();
|
||||
|
||||
Dictionary dictionary = getDictionary(analyzeRequest);
|
||||
Dictionary dictionary = dictionaryService.getDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
|
||||
|
||||
Document document = documentFuture.get();
|
||||
ImportedRedactions importedRedactions = importedRedactionsFuture.get();
|
||||
@ -195,7 +194,7 @@ public class AnalysisPreparationService {
|
||||
taskExecutor);
|
||||
|
||||
CompletableFuture<DictionaryAndNotFoundEntries> dictionaryAndNotFoundEntriesCompletableFuture = CompletableFuture.supplyAsync(() -> {
|
||||
Dictionary dictionary = getDictionary(analyzeRequest);
|
||||
Dictionary dictionary = dictionaryService.getDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
|
||||
NotFoundEntries notFoundEntries = getNotFoundEntries(analyzeRequest, reanalysisSetupData.document(), reanalysisInitialProcessingData.importedRedactions());
|
||||
return new DictionaryAndNotFoundEntries(dictionary, notFoundEntries.notFoundManualRedactionEntries(), notFoundEntries.notFoundImportedEntries());
|
||||
}, taskExecutor);
|
||||
@ -253,15 +252,6 @@ public class AnalysisPreparationService {
|
||||
}
|
||||
|
||||
|
||||
private Dictionary getDictionary(AnalyzeRequest analyzeRequest) {
|
||||
|
||||
dictionaryService.updateDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
|
||||
log.info("Updated Dictionaries for file {} in dossier {}", analyzeRequest.getFileId(), analyzeRequest.getDossierId());
|
||||
return dictionary;
|
||||
}
|
||||
|
||||
|
||||
private NotFoundEntries getNotFoundEntries(AnalyzeRequest analyzeRequest, Document document, ImportedRedactions importedRedactions) {
|
||||
|
||||
var notFoundManualRedactionEntries = manualRedactionEntryService.addManualRedactionEntriesAndReturnNotFoundEntries(analyzeRequest,
|
||||
|
||||
@ -0,0 +1,107 @@
|
||||
package com.iqser.red.service.redaction.v1.server.service;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.cache.Cache;
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import com.google.common.cache.CacheLoader;
|
||||
import com.google.common.cache.LoadingCache;
|
||||
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryRepresentation;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.TenantDictionary;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class DictionaryCacheService {
|
||||
|
||||
private final LoadingCache<String, TenantDictionary> tenantDictionaryCache;
|
||||
private final Cache<DictionaryCacheKey, Dictionary> dictionaryCache;
|
||||
|
||||
|
||||
public DictionaryCacheService(RedactionServiceSettings settings) {
|
||||
|
||||
tenantDictionaryCache = CacheBuilder.newBuilder()
|
||||
.maximumSize(settings.getDictionaryCacheMaximumSize())
|
||||
.expireAfterAccess(settings.getDictionaryCacheExpireAfterAccessDays(), TimeUnit.DAYS)
|
||||
.build(new CacheLoader<>() {
|
||||
public TenantDictionary load(String key) {
|
||||
|
||||
return new TenantDictionary();
|
||||
}
|
||||
});
|
||||
|
||||
dictionaryCache = CacheBuilder.newBuilder()
|
||||
.maximumSize(settings.getFirstLevelDictionaryCacheMaximumSize())
|
||||
.expireAfterAccess(settings.getDictionaryCacheExpireAfterAccessDays(), TimeUnit.DAYS)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public void clearAllCaches() {
|
||||
|
||||
tenantDictionaryCache.invalidateAll();
|
||||
dictionaryCache.invalidateAll();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public DictionaryRepresentation getDossierTemplateDictionary(String dossierTemplateId) {
|
||||
|
||||
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossierTemplate()
|
||||
.get(dossierTemplateId);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public DictionaryRepresentation getDossierDictionary(String dossierId) {
|
||||
|
||||
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier()
|
||||
.get(dossierId);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void addDictionaryRepresentationForDossierTemplate(String dossierTemplateId, DictionaryRepresentation dictionaryRepresentation) {
|
||||
|
||||
tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossierTemplate().put(dossierTemplateId, dictionaryRepresentation);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void addDictionaryRepresentationForDossier(String dossierId, DictionaryRepresentation dictionaryRepresentation) {
|
||||
|
||||
tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier().put(dossierId, dictionaryRepresentation);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<Dictionary> getDictionary(String tenantId, String dossierId) {
|
||||
|
||||
return Optional.ofNullable(dictionaryCache.getIfPresent(new DictionaryCacheKey(tenantId, dossierId)));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void putDictionary(String tenantId, String dossierId, Dictionary newDictionary) {
|
||||
|
||||
dictionaryCache.put(new DictionaryCacheKey(tenantId, dossierId), newDictionary);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public record DictionaryCacheKey(String tenantId, String dossierId) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -38,40 +38,21 @@ public class DictionarySearchService {
|
||||
@Observed(name = "DictionarySearchService", contextualName = "add-dictionary-entries")
|
||||
public void addDictionaryEntities(Dictionary dictionary, SemanticNode node) {
|
||||
|
||||
dictionary.getDictionaryModels()
|
||||
.forEach(model -> {
|
||||
bySearchImplementationAsDictionary(model.getEntriesSearch(),
|
||||
model.getType(),
|
||||
model.isHint() ? EntityType.HINT : EntityType.ENTITY,
|
||||
node,
|
||||
model.isDossierDictionary());
|
||||
bySearchImplementationAsDictionary(model.getFalsePositiveSearch(), model.getType(), EntityType.FALSE_POSITIVE, node, model.isDossierDictionary());
|
||||
bySearchImplementationAsDictionary(model.getFalseRecommendationsSearch(), model.getType(), EntityType.FALSE_RECOMMENDATION, node, model.isDossierDictionary());
|
||||
if (model.isDossierDictionary()) {
|
||||
bySearchImplementationAsDictionary(model.getDeletionEntriesSearch(), model.getType(), EntityType.DICTIONARY_REMOVAL, node, model.isDossierDictionary());
|
||||
}
|
||||
EntityCreationService entityCreationService = new EntityCreationService(entityEnrichmentService);
|
||||
dictionary.getDictionarySearch().getBoundaries(node.getTextBlock())
|
||||
.filter(boundary -> entityCreationService.isValidEntityTextRange(node.getTextBlock(), boundary.textRange()))
|
||||
.forEach(match -> {
|
||||
|
||||
Set<Engine> engines = match.identifier().dossierDictionaryEntry() ? Set.of(Engine.DOSSIER_DICTIONARY) : Set.of(Engine.DICTIONARY);
|
||||
entityCreationService.byTextRangeWithEngine(match.textRange(), match.identifier().type(), match.identifier().entityType(), node, engines)
|
||||
.ifPresent(entity -> {
|
||||
entity.setDictionaryEntry(true);
|
||||
entity.setDossierDictionaryEntry(match.identifier().dossierDictionaryEntry());
|
||||
if (match.identifier().entityType().equals(EntityType.DICTIONARY_REMOVAL)) {
|
||||
entity.ignore("DICT.0.0", "Ignore Dossier Dictionary Entity with DICTIONARY_REMOVAL entity type");
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public void bySearchImplementationAsDictionary(SearchImplementation searchImplementation,
|
||||
String type,
|
||||
EntityType entityType,
|
||||
SemanticNode node,
|
||||
boolean isDossierDictionaryEntry) {
|
||||
|
||||
Set<Engine> engines = isDossierDictionaryEntry ? Set.of(Engine.DOSSIER_DICTIONARY) : Set.of(Engine.DICTIONARY);
|
||||
EntityCreationService entityCreationService = new EntityCreationService(entityEnrichmentService);
|
||||
searchImplementation.getBoundaries(node.getTextBlock())
|
||||
.filter(boundary -> entityCreationService.isValidEntityTextRange(node.getTextBlock(), boundary))
|
||||
.forEach(bounds -> entityCreationService.byTextRangeWithEngine(bounds, type, entityType, node, engines)
|
||||
.ifPresent(entity -> {
|
||||
entity.setDictionaryEntry(true);
|
||||
entity.setDossierDictionaryEntry(isDossierDictionaryEntry);
|
||||
if (entityType.equals(EntityType.DICTIONARY_REMOVAL)) {
|
||||
entity.ignore("DICT.0.0", "Ignore Dossier Dictionary Entity with DICTIONARY_REMOVAL entity type");
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
@ -9,38 +8,27 @@ import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import com.google.common.cache.CacheLoader;
|
||||
import com.google.common.cache.LoadingCache;
|
||||
import com.iqser.red.service.dictionarymerge.commons.CommonsDictionaryModel;
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
|
||||
import com.iqser.red.service.dictionarymerge.commons.DictionaryMergeService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.configuration.Colors;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.type.Type;
|
||||
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryEntries;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryFactory;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryRepresentation;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.TenantDictionary;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import feign.FeignException;
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -50,34 +38,40 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class DictionaryService {
|
||||
|
||||
public static final String DEFAULT_COLOR = "#cccccc";
|
||||
private final DictionaryClient dictionaryClient;
|
||||
|
||||
private final RedactionServiceSettings settings;
|
||||
|
||||
private final DictionaryMergeService dictionaryMergeService;
|
||||
|
||||
private LoadingCache<String, TenantDictionary> tenantDictionaryCache;
|
||||
private final DictionaryCacheService dictionaryCacheService;
|
||||
private final DictionaryFactory dictionaryFactory;
|
||||
|
||||
|
||||
@PostConstruct
|
||||
protected void createCache() {
|
||||
@SneakyThrows
|
||||
@Observed(name = "DictionaryService", contextualName = "get-dictionary")
|
||||
@Timed("redactmanager_getDictionary")
|
||||
public Dictionary getDictionary(String dossierTemplateId, String dossierId) {
|
||||
|
||||
tenantDictionaryCache = CacheBuilder.newBuilder()
|
||||
.maximumSize(settings.getDictionaryCacheMaximumSize())
|
||||
.expireAfterAccess(settings.getDictionaryCacheExpireAfterAccessDays(), TimeUnit.DAYS)
|
||||
.build(new CacheLoader<>() {
|
||||
public TenantDictionary load(String key) {
|
||||
String tenantId = TenantContext.getTenantId();
|
||||
|
||||
return new TenantDictionary();
|
||||
}
|
||||
});
|
||||
}
|
||||
Optional<Dictionary> cachedDictionary = dictionaryCacheService.getDictionary(tenantId, dossierId);
|
||||
|
||||
if (cachedDictionary.isPresent()) {
|
||||
log.debug("Dictionary found in cache");
|
||||
boolean isUpToDate = checkIfDictionaryIsUpToDate(dossierTemplateId, dossierId, cachedDictionary.get());
|
||||
if (isUpToDate) {
|
||||
log.info("Returning cached Dictionary for tenantId: {}, dossierId: {}", tenantId, dossierId);
|
||||
return cachedDictionary.get();
|
||||
} else {
|
||||
log.debug("Cached Dictionary is outdated for tenantId: {}, dossierId: {}", tenantId, dossierId);
|
||||
}
|
||||
} else {
|
||||
log.info("No cached Dictionary found for tenantId: {}, dossierId: {}", tenantId, dossierId);
|
||||
}
|
||||
|
||||
public void clearTenantDictionaryCache() {
|
||||
DictionaryVersion latestVersion = updateDictionary(dossierTemplateId, dossierId);
|
||||
Dictionary newDictionary = buildDictionary(dossierTemplateId, dossierId, latestVersion);
|
||||
|
||||
tenantDictionaryCache.invalidateAll();
|
||||
dictionaryCacheService.putDictionary(tenantId, dossierId, newDictionary);
|
||||
log.debug("Cached new Dictionary for tenantId: {}, dossierId: {}", tenantId, dossierId);
|
||||
|
||||
return newDictionary;
|
||||
}
|
||||
|
||||
|
||||
@ -87,19 +81,24 @@ public class DictionaryService {
|
||||
public DictionaryVersion updateDictionary(String dossierTemplateId, String dossierId) {
|
||||
|
||||
log.debug("Updating dictionary data for dossierTemplate {} and dossier {}", dossierTemplateId, dossierId);
|
||||
long dossierTemplateDictionaryVersion = dictionaryClient.getVersion(dossierTemplateId);
|
||||
var dossierTemplateDictionary = getDossierTemplateDictionary(dossierTemplateId);
|
||||
if (dossierTemplateDictionary == null || dossierTemplateDictionaryVersion > dossierTemplateDictionary.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(dossierTemplateId, dossierTemplateDictionaryVersion, getVersion(dossierTemplateDictionary), null);
|
||||
|
||||
// Update template dictionary
|
||||
long latestTemplateVersion = dictionaryClient.getVersion(dossierTemplateId);
|
||||
DictionaryRepresentation templateDictRep = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId);
|
||||
|
||||
if (templateDictRep == null || latestTemplateVersion > templateDictRep.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(dossierTemplateId, latestTemplateVersion, templateDictRep != null ? templateDictRep.getDictionaryVersion() : null, null);
|
||||
}
|
||||
|
||||
long dossierDictionaryVersion = dictionaryClient.getVersionForDossier(dossierId);
|
||||
var dossierDictionary = getDossierDictionary(dossierId);
|
||||
if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(dossierTemplateId, dossierDictionaryVersion, getVersion(dossierDictionary), dossierId);
|
||||
// Update dossier dictionary
|
||||
long latestDossierVersion = dictionaryClient.getVersionForDossier(dossierId);
|
||||
DictionaryRepresentation dossierDictRep = dictionaryCacheService.getDossierDictionary(dossierId);
|
||||
|
||||
if (dossierDictRep == null || latestDossierVersion > dossierDictRep.getDictionaryVersion()) {
|
||||
updateDictionaryEntry(dossierTemplateId, latestDossierVersion, dossierDictRep != null ? dossierDictRep.getDictionaryVersion() : null, dossierId);
|
||||
}
|
||||
|
||||
return DictionaryVersion.builder().dossierTemplateVersion(dossierTemplateDictionaryVersion).dossierVersion(dossierDictionaryVersion).build();
|
||||
return DictionaryVersion.builder().dossierTemplateVersion(latestTemplateVersion).dossierVersion(latestDossierVersion).build();
|
||||
}
|
||||
|
||||
|
||||
@ -108,57 +107,21 @@ public class DictionaryService {
|
||||
@Timed("redactmanager_getDictionaryIncrements")
|
||||
public DictionaryIncrement getDictionaryIncrements(String dossierTemplateId, DictionaryVersion fromVersion, String dossierId) {
|
||||
|
||||
DictionaryVersion version = updateDictionary(dossierTemplateId, dossierId);
|
||||
DictionaryVersion latestVersion = updateDictionary(dossierTemplateId, dossierId);
|
||||
|
||||
Set<DictionaryIncrementValue> newValues = new HashSet<>();
|
||||
List<DictionaryModel> dictionaryModels = getDossierTemplateDictionary(dossierTemplateId).getDictionary();
|
||||
Set<DictionaryIncrementValue> newValues = Collections.synchronizedSet(new HashSet<>());
|
||||
List<DictionaryModel> templateDictionaries = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId).getDictionary();
|
||||
|
||||
dictionaryModels.forEach(dictionaryModel -> {
|
||||
dictionaryModel.getEntries()
|
||||
.forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
dictionaryModel.getFalsePositives()
|
||||
.forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
dictionaryModel.getFalseRecommendations()
|
||||
.forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
});
|
||||
templateDictionaries.parallelStream()
|
||||
.forEach(dictionaryModel -> dictionaryModel.addNewEntries(fromVersion.getDossierTemplateVersion(), newValues));
|
||||
|
||||
if (dossierDictionaryExists(dossierId)) {
|
||||
dictionaryModels = getDossierDictionary(dossierId).getDictionary();
|
||||
dictionaryModels.forEach(dictionaryModel -> {
|
||||
dictionaryModel.getEntries()
|
||||
.forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
dictionaryModel.getFalsePositives()
|
||||
.forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
dictionaryModel.getFalseRecommendations()
|
||||
.forEach(dictionaryEntry -> {
|
||||
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
|
||||
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
|
||||
}
|
||||
});
|
||||
});
|
||||
List<DictionaryModel> dossierDictionaries = dictionaryCacheService.getDossierDictionary(dossierId).getDictionary();
|
||||
dossierDictionaries.parallelStream()
|
||||
.forEach(dictionaryModel -> dictionaryModel.addNewEntries(fromVersion.getDossierVersion(), newValues));
|
||||
}
|
||||
|
||||
return new DictionaryIncrement(newValues, version);
|
||||
return new DictionaryIncrement(newValues, latestVersion);
|
||||
}
|
||||
|
||||
|
||||
@ -168,342 +131,164 @@ public class DictionaryService {
|
||||
try {
|
||||
DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation();
|
||||
|
||||
var typeResponse = dossierId == null ? dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId, currentVersion, true) : dictionaryClient.getAllTypesForDossier(
|
||||
dossierId,
|
||||
currentVersion,
|
||||
true);
|
||||
List<Type> typeResponse;
|
||||
if (dossierId == null) {
|
||||
typeResponse = dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId, currentVersion, true);
|
||||
} else {
|
||||
typeResponse = dictionaryClient.getAllTypesForDossier(dossierId, currentVersion, true);
|
||||
}
|
||||
|
||||
if (CollectionUtils.isNotEmpty(typeResponse)) {
|
||||
|
||||
String tenantId = TenantContext.getTenantId();
|
||||
List<DictionaryModel> dictionary = typeResponse.stream()
|
||||
.parallel()
|
||||
.map(t -> {
|
||||
|
||||
TenantContext.setTenantId(tenantId);
|
||||
Optional<DictionaryModel> optionalOldModel;
|
||||
if (dossierId == null) {
|
||||
var representation = getDossierTemplateDictionary(dossierTemplateId);
|
||||
optionalOldModel = representation != null ? representation.getDictionary()
|
||||
.stream()
|
||||
.filter(f -> f.getType().equals(t.getType()))
|
||||
.findAny() : Optional.empty();
|
||||
} else {
|
||||
var representation = getDossierDictionary(dossierId);
|
||||
optionalOldModel = representation != null ? representation.getDictionary()
|
||||
.stream()
|
||||
.filter(f -> f.getType().equals(t.getType()))
|
||||
.findAny() : Optional.empty();
|
||||
}
|
||||
|
||||
Set<DictionaryEntryModel> entries = new HashSet<>();
|
||||
Set<DictionaryEntryModel> falsePositives = new HashSet<>();
|
||||
Set<DictionaryEntryModel> falseRecommendations = new HashSet<>();
|
||||
|
||||
DictionaryEntries newEntries = mapEntries(t);
|
||||
|
||||
var newValues = newEntries.getEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
var newFalsePositivesValues = newEntries.getFalsePositives()
|
||||
.stream()
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
var newFalseRecommendationsValues = newEntries.getFalseRecommendations()
|
||||
.stream()
|
||||
.map(DictionaryEntry::getValue)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
optionalOldModel.ifPresent(oldDictionaryModel -> {
|
||||
|
||||
});
|
||||
if (optionalOldModel.isPresent()) {
|
||||
var oldModel = optionalOldModel.get();
|
||||
if (oldModel.isCaseInsensitive() && !t.isCaseInsensitive()) {
|
||||
// add old entries from existing DictionaryModel but exclude lower case representation
|
||||
entries.addAll(oldModel.getEntries()
|
||||
.stream()
|
||||
.filter(f -> !newValues.stream()
|
||||
.map(s -> s.toLowerCase(Locale.ROOT))
|
||||
.toList().contains(f.getValue()))
|
||||
.toList());
|
||||
falsePositives.addAll(oldModel.getFalsePositives()
|
||||
.stream()
|
||||
.filter(f -> !newFalsePositivesValues.stream()
|
||||
.map(s -> s.toLowerCase(Locale.ROOT))
|
||||
.toList().contains(f.getValue()))
|
||||
.toList());
|
||||
falseRecommendations.addAll(oldModel.getFalseRecommendations()
|
||||
.stream()
|
||||
.filter(f -> !newFalseRecommendationsValues.stream()
|
||||
.map(s -> s.toLowerCase(Locale.ROOT))
|
||||
.toList().contains(f.getValue()))
|
||||
.toList());
|
||||
} else if (!oldModel.isCaseInsensitive() && t.isCaseInsensitive()) {
|
||||
// add old entries from existing DictionaryModel but exclude upper case representation
|
||||
entries.addAll(oldModel.getEntries()
|
||||
.stream()
|
||||
.filter(f -> !newValues.contains(f.getValue().toLowerCase(Locale.ROOT)))
|
||||
.toList());
|
||||
falsePositives.addAll(oldModel.getFalsePositives()
|
||||
.stream()
|
||||
.filter(f -> !newFalsePositivesValues.contains(f.getValue().toLowerCase(Locale.ROOT)))
|
||||
.toList());
|
||||
falseRecommendations.addAll(oldModel.getFalseRecommendations()
|
||||
.stream()
|
||||
.filter(f -> !newFalseRecommendationsValues.contains(f.getValue().toLowerCase(Locale.ROOT)))
|
||||
.toList());
|
||||
|
||||
} else {
|
||||
// add old entries from existing DictionaryModel
|
||||
entries.addAll(oldModel.getEntries()
|
||||
.stream()
|
||||
.filter(f -> !newValues.contains(f.getValue()))
|
||||
.toList());
|
||||
falsePositives.addAll(oldModel.getFalsePositives()
|
||||
.stream()
|
||||
.filter(f -> !newFalsePositivesValues.contains(f.getValue()))
|
||||
.toList());
|
||||
falseRecommendations.addAll(oldModel.getFalseRecommendations()
|
||||
.stream()
|
||||
.filter(f -> !newFalseRecommendationsValues.contains(f.getValue()))
|
||||
.toList());
|
||||
}
|
||||
}
|
||||
|
||||
// Add Increments
|
||||
entries.addAll(newEntries.getEntries());
|
||||
falsePositives.addAll(newEntries.getFalsePositives());
|
||||
falseRecommendations.addAll(newEntries.getFalseRecommendations());
|
||||
|
||||
return new DictionaryModel(t.getType(),
|
||||
t.getRank(),
|
||||
convertColor(t.getHexColor()),
|
||||
t.isCaseInsensitive(),
|
||||
t.isHint(),
|
||||
entries,
|
||||
falsePositives,
|
||||
falseRecommendations,
|
||||
dossierId != null);
|
||||
})
|
||||
.map(t -> mapTypeToDictionaryModel(tenantId, t, dossierTemplateId, dossierId))
|
||||
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
dictionary.forEach(dm -> dictionaryRepresentation.getLocalAccessMap().put(dm.getType(), dm));
|
||||
|
||||
Colors colors = dictionaryClient.getColors(dossierTemplateId);
|
||||
|
||||
dictionaryRepresentation.setDefaultColor(convertColor(DEFAULT_COLOR));
|
||||
dictionaryRepresentation.setRequestAddColor(convertColor(colors.getRequestAddColor()));
|
||||
dictionaryRepresentation.setRequestRemoveColor(convertColor(colors.getRequestRemoveColor()));
|
||||
dictionaryRepresentation.setNotRedactedColor(convertColor(colors.getSkippedColor()));
|
||||
dictionaryRepresentation.setDossierTemplateId(dossierTemplateId);
|
||||
dictionaryRepresentation.setDictionaryVersion(version);
|
||||
dictionaryRepresentation.setDictionary(dictionary);
|
||||
|
||||
if (dossierId == null) {
|
||||
addDictionaryRepresentationForDossierTemplate(dossierTemplateId, dictionaryRepresentation);
|
||||
dictionaryCacheService.addDictionaryRepresentationForDossierTemplate(dossierTemplateId, dictionaryRepresentation);
|
||||
} else {
|
||||
addDictionaryRepresentationForDossier(dossierId, dictionaryRepresentation);
|
||||
dictionaryCacheService.addDictionaryRepresentationForDossier(dossierId, dictionaryRepresentation);
|
||||
}
|
||||
}
|
||||
} catch (FeignException e) {
|
||||
log.warn("Got some unknown feignException", e);
|
||||
log.warn("Got some unknown FeignException", e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private DictionaryModel mapTypeToDictionaryModel(String tenantId, Type type, String dossierTemplateId, String dossierId) {
|
||||
|
||||
TenantContext.setTenantId(tenantId);
|
||||
Optional<DictionaryModel> optionalOldModel = getExistingDictionaryModel(type.getType(), dossierTemplateId, dossierId);
|
||||
|
||||
DictionaryEntries newEntries = mapEntries(type);
|
||||
Set<DictionaryEntryModel> combinedEntries = new HashSet<>(newEntries.getEntries());
|
||||
Set<DictionaryEntryModel> combinedFalsePositives = new HashSet<>(newEntries.getFalsePositives());
|
||||
Set<DictionaryEntryModel> combinedFalseRecommendations = new HashSet<>(newEntries.getFalseRecommendations());
|
||||
|
||||
optionalOldModel.ifPresent(oldModel -> oldModel.handleOldEntries(type, newEntries, combinedEntries, combinedFalsePositives, combinedFalseRecommendations));
|
||||
|
||||
combinedEntries.addAll(newEntries.getEntries());
|
||||
combinedFalsePositives.addAll(newEntries.getFalsePositives());
|
||||
combinedFalseRecommendations.addAll(newEntries.getFalseRecommendations());
|
||||
|
||||
return new DictionaryModel(type.getType(),
|
||||
type.getRank(),
|
||||
null,
|
||||
type.isCaseInsensitive(),
|
||||
type.isHint(),
|
||||
combinedEntries,
|
||||
combinedFalsePositives,
|
||||
combinedFalseRecommendations,
|
||||
dossierId != null);
|
||||
}
|
||||
|
||||
|
||||
private Optional<DictionaryModel> getExistingDictionaryModel(String type, String dossierTemplateId, String dossierId) {
|
||||
|
||||
DictionaryRepresentation representation = dossierId == null ? //
|
||||
dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId) : dictionaryCacheService.getDossierDictionary(dossierId);
|
||||
|
||||
return representation != null ? representation.getDictionary()
|
||||
.stream()
|
||||
.filter(f -> f.getType().equals(type))
|
||||
.findAny() : Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private DictionaryEntries mapEntries(Type type) {
|
||||
|
||||
Set<DictionaryEntryModel> entries = type.getEntries() != null ? new HashSet<>(type.getEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::new)
|
||||
.collect(Collectors.toSet())) : new HashSet<>();
|
||||
Set<DictionaryEntryModel> falsePositives = type.getFalsePositiveEntries() != null ? new HashSet<>(type.getFalsePositiveEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::new)
|
||||
.collect(Collectors.toSet())) : new HashSet<>();
|
||||
Set<DictionaryEntryModel> falseRecommendations = type.getFalseRecommendationEntries() != null ? new HashSet<>(type.getFalseRecommendationEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::new)
|
||||
.collect(Collectors.toSet())) : new HashSet<>();
|
||||
Set<DictionaryEntryModel> entries = type.getEntries() != null ? type.getEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::new)
|
||||
.collect(Collectors.toSet()) : new HashSet<>();
|
||||
|
||||
Set<DictionaryEntryModel> falsePositives = type.getFalsePositiveEntries() != null ? type.getFalsePositiveEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::new)
|
||||
.collect(Collectors.toSet()) : new HashSet<>();
|
||||
|
||||
Set<DictionaryEntryModel> falseRecommendations = type.getFalseRecommendationEntries() != null ? type.getFalseRecommendationEntries()
|
||||
.stream()
|
||||
.map(DictionaryEntryModel::new)
|
||||
.collect(Collectors.toSet()) : new HashSet<>();
|
||||
|
||||
if (type.isCaseInsensitive()) {
|
||||
entries.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
|
||||
falsePositives.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
|
||||
falseRecommendations.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
|
||||
}
|
||||
log.debug("Dictionary update returned {} entries {} falsePositives and {} falseRecommendations for type {}",
|
||||
|
||||
log.debug("Dictionary update returned {} entries, {} falsePositives, and {} falseRecommendations for type {}",
|
||||
entries.size(),
|
||||
falsePositives.size(),
|
||||
falseRecommendations.size(),
|
||||
entries);
|
||||
type.getType());
|
||||
|
||||
return new DictionaryEntries(entries, falsePositives, falseRecommendations);
|
||||
}
|
||||
|
||||
|
||||
private float[] convertColor(String hex) {
|
||||
|
||||
Color color = Color.decode(hex);
|
||||
return new float[]{color.getRed() / 255f, color.getGreen() / 255f, color.getBlue() / 255f};
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public float[] getColor(String type, String dossierTemplateId) {
|
||||
|
||||
DictionaryModel model = getDossierTemplateDictionary(dossierTemplateId).getLocalAccessMap()
|
||||
.get(type);
|
||||
if (model != null) {
|
||||
return model.getColor();
|
||||
}
|
||||
return getDossierTemplateDictionary(dossierTemplateId).getDefaultColor();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public boolean isHint(String type, String dossierTemplateId) {
|
||||
|
||||
DictionaryModel model = getDossierTemplateDictionary(dossierTemplateId).getLocalAccessMap()
|
||||
DictionaryModel model = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId).getLocalAccessMap()
|
||||
.get(type);
|
||||
if (model != null) {
|
||||
return model.isHint();
|
||||
}
|
||||
return false;
|
||||
return model != null && model.isHint();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Timed("redactmanager_getDeepCopyDictionary")
|
||||
@Observed(name = "DictionaryService", contextualName = "deep-copy-dictionary")
|
||||
public Dictionary getDeepCopyDictionary(String dossierTemplateId, String dossierId) {
|
||||
private Dictionary buildDictionary(String dossierTemplateId, String dossierId, DictionaryVersion dictionaryVersion) {
|
||||
|
||||
List<DictionaryModel> mergedDictionaries = new LinkedList<>();
|
||||
|
||||
DictionaryRepresentation dossierTemplateRepresentation = getDossierTemplateDictionary(dossierTemplateId);
|
||||
List<DictionaryModel> dossierTemplateDictionaries = dossierTemplateRepresentation.getDictionary();
|
||||
dossierTemplateDictionaries.forEach(dm -> mergedDictionaries.add(SerializationUtils.clone(dm)));
|
||||
|
||||
// add dossier
|
||||
long dossierDictionaryVersion = -1;
|
||||
if (dossierDictionaryExists(dossierId)) {
|
||||
DictionaryRepresentation dossierRepresentation = getDossierDictionary(dossierId);
|
||||
List<DictionaryModel> dossierDictionaries = dossierRepresentation.getDictionary();
|
||||
dossierDictionaries.forEach(dm -> mergedDictionaries.add(SerializationUtils.clone(dm)));
|
||||
return getDictionary(mergedDictionaries, dossierTemplateRepresentation, dossierRepresentation.getDictionaryVersion());
|
||||
} else {
|
||||
return getDictionary(mergedDictionaries, dossierTemplateRepresentation, dossierDictionaryVersion);
|
||||
DictionaryRepresentation templateDictRep = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId);
|
||||
if (templateDictRep != null) {
|
||||
templateDictRep.getDictionary()
|
||||
.forEach(dm -> mergedDictionaries.add(dm.clone()));
|
||||
}
|
||||
|
||||
if (dossierDictionaryExists(dossierId)) {
|
||||
DictionaryRepresentation dossierDictRep = dictionaryCacheService.getDossierDictionary(dossierId);
|
||||
if (dossierDictRep != null) {
|
||||
dossierDictRep.getDictionary()
|
||||
.forEach(dm -> mergedDictionaries.add(dm.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
return dictionaryFactory.create(mergedDictionaries.stream()
|
||||
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
|
||||
.collect(Collectors.toList()), dictionaryVersion);
|
||||
}
|
||||
|
||||
|
||||
private Dictionary getDictionary(List<DictionaryModel> mergedDictionaries, DictionaryRepresentation dossierTemplateRepresentation, long dossierDictionaryVersion) {
|
||||
private boolean checkIfDictionaryIsUpToDate(String dossierTemplateId, String dossierId, Dictionary cachedDictionary) {
|
||||
|
||||
return new Dictionary(mergedDictionaries.stream()
|
||||
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
|
||||
.collect(Collectors.toList()),
|
||||
DictionaryVersion.builder()
|
||||
.dossierTemplateVersion(dossierTemplateRepresentation.getDictionaryVersion())
|
||||
.dossierVersion(dossierDictionaryVersion)
|
||||
.build());
|
||||
}
|
||||
long latestTemplateVersion = dictionaryClient.getVersion(dossierTemplateId);
|
||||
long latestDossierVersion = dictionaryClient.getVersionForDossier(dossierId);
|
||||
|
||||
DictionaryVersion cachedVersion = cachedDictionary.getVersion();
|
||||
|
||||
@SneakyThrows
|
||||
public float[] getNotRedactedColor(String dossierTemplateId) {
|
||||
|
||||
return getDossierTemplateDictionary(dossierTemplateId).getNotRedactedColor();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void addDictionaryRepresentationForDossierTemplate(String dossierTemplateId, DictionaryRepresentation dictionaryRepresentation) {
|
||||
|
||||
tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossierTemplate().put(dossierTemplateId, dictionaryRepresentation);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void addDictionaryRepresentationForDossier(String dossierId, DictionaryRepresentation dictionaryRepresentation) {
|
||||
|
||||
tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier().put(dossierId, dictionaryRepresentation);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private DictionaryRepresentation getDossierTemplateDictionary(String dossierTemplateId) {
|
||||
|
||||
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossierTemplate()
|
||||
.get(dossierTemplateId);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private DictionaryRepresentation getDossierDictionary(String dossierId) {
|
||||
|
||||
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier()
|
||||
.get(dossierId);
|
||||
return (cachedVersion.getDossierTemplateVersion() >= latestTemplateVersion) && (cachedVersion.getDossierVersion() >= latestDossierVersion);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean dossierDictionaryExists(String dossierId) {
|
||||
|
||||
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier().containsKey(dossierId);
|
||||
}
|
||||
|
||||
|
||||
private Long getVersion(DictionaryRepresentation dictionaryRepresentation) {
|
||||
|
||||
if (dictionaryRepresentation == null) {
|
||||
return null;
|
||||
} else {
|
||||
return dictionaryRepresentation.getDictionaryVersion();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<CommonsDictionaryModel> convertDictionaryModel(List<DictionaryModel> dictionaries) {
|
||||
|
||||
return dictionaries.stream()
|
||||
.map(d -> CommonsDictionaryModel.builder()
|
||||
.type(d.getType())
|
||||
.rank(d.getRank())
|
||||
.color(d.getColor())
|
||||
.caseInsensitive(d.isCaseInsensitive())
|
||||
.hint(d.isHint())
|
||||
.isDossierDictionary(d.isDossierDictionary())
|
||||
.entries(d.getEntries())
|
||||
.falsePositives(d.getFalsePositives())
|
||||
.falseRecommendations(d.getFalseRecommendations())
|
||||
.build())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private List<DictionaryModel> convertCommonsDictionaryModel(List<CommonsDictionaryModel> commonsDictionaries) {
|
||||
|
||||
return commonsDictionaries.stream()
|
||||
.map(cd -> new DictionaryModel(cd.getType(),
|
||||
cd.getRank(),
|
||||
cd.getColor(),
|
||||
cd.isCaseInsensitive(),
|
||||
cd.isHint(),
|
||||
cd.getEntries(),
|
||||
cd.getFalsePositives(),
|
||||
cd.getFalseRecommendations(),
|
||||
cd.isDossierDictionary()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public List<Type> getAllTypes(String dossierTemplateId, String dossierId) {
|
||||
|
||||
List<Type> allTypes = dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId, null, false);
|
||||
allTypes.addAll(dictionaryClient.getAllTypesForDossier(dossierId, null, false));
|
||||
return allTypes;
|
||||
DictionaryRepresentation dossierDictRep = dictionaryCacheService.getDossierDictionary(dossierId);
|
||||
return dossierDictRep != null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -265,7 +265,7 @@ public class RedactionStorageService {
|
||||
// And the cache eviction logic when a file changes after e.g. ocr is not implemented yet.
|
||||
// See https://knecon.atlassian.net/jira/software/c/projects/RED/boards/37?selectedIssue=RED-8106.
|
||||
@Timed("redactmanager_getDocumentGraph")
|
||||
@Cacheable(value = "documentDataCache")
|
||||
//@Cacheable(value = "documentDataCache")
|
||||
public DocumentData getDocumentData(String dossierId, String fileId) {
|
||||
|
||||
try {
|
||||
|
||||
@ -67,6 +67,7 @@ import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.service.AnalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DictionaryCacheService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DocumentSearchService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.UnprocessedChangesService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.websocket.RedisSyncedWebSocketService;
|
||||
@ -207,6 +208,10 @@ public abstract class AbstractRedactionIntegrationTest {
|
||||
@Autowired
|
||||
protected TenantMongoLiquibaseExecutor tenantMongoLiquibaseExecutor;
|
||||
|
||||
@Autowired
|
||||
DictionaryCacheService dictionaryCacheService;
|
||||
|
||||
|
||||
protected final Map<String, List<String>> dictionary = new HashMap<>();
|
||||
protected final Map<String, List<String>> dossierDictionary = new HashMap<>();
|
||||
protected final Map<String, List<String>> falsePositive = new HashMap<>();
|
||||
@ -271,6 +276,7 @@ public abstract class AbstractRedactionIntegrationTest {
|
||||
}
|
||||
entityLogDocumentRepository.deleteAll();
|
||||
entityLogEntryDocumentRepository.deleteAll();
|
||||
dictionaryCacheService.clearAllCaches();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -62,6 +62,7 @@ import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryFactory;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
|
||||
@ -148,6 +149,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
private MongoConnectionProvider mongoConnectionProvider;
|
||||
@MockBean
|
||||
private TenantProvider tenantProvider;
|
||||
@Autowired
|
||||
private DictionaryFactory dictionaryFactory;
|
||||
|
||||
|
||||
@Test
|
||||
@ -250,17 +253,12 @@ import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse);
|
||||
when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0));
|
||||
when(dictionaryService.getDeepCopyDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary);
|
||||
when(dictionaryService.getDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary);
|
||||
when(dictionaryService.getDictionaryIncrements(any(), any(), any())).thenReturn(new DictionaryIncrement(Collections.emptySet(), new DictionaryVersion(0, 0)));
|
||||
when(dictionaryService.isHint(any(String.class), any())).thenAnswer(invocation -> {
|
||||
String type = invocation.getArgument(0);
|
||||
return testDossierTemplate.testDictionary.isHint(type);
|
||||
});
|
||||
when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> {
|
||||
String type = invocation.getArgument(0);
|
||||
return testDossierTemplate.testDictionary.getType(type).getColor();
|
||||
});
|
||||
when(dictionaryService.getNotRedactedColor(any())).thenReturn(new float[]{0.2f, 0.2f, 0.2f});
|
||||
|
||||
when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
|
||||
when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(testDossierTemplate.rules));
|
||||
@ -422,7 +420,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
componentRules = new String(Files.readAllBytes(componentRuleFile.toPath()));
|
||||
}
|
||||
|
||||
testDictionary = new Dictionary(dictionaries, new DictionaryVersion(0, 0));
|
||||
testDictionary = dictionaryFactory.create(dictionaries, new DictionaryVersion(0, 0));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -225,14 +225,14 @@ public class DictionaryServiceTest {
|
||||
when(dictionaryClient.getDictionaryForType("dossierType", 0L)).thenReturn(dossierType);
|
||||
|
||||
dictionaryService.updateDictionary("dtId", "dossierId");
|
||||
var dict = dictionaryService.getDeepCopyDictionary("dtId", "dossierId");
|
||||
assertThat(dict.getDictionaryModels().size()).isEqualTo(2);
|
||||
var dict = dictionaryService.getDictionary("dtId", "dossierId");
|
||||
assertThat(dict.getDictionaryModels().size()).isEqualTo(1);
|
||||
var dictModel = dict.getDictionaryModels()
|
||||
.get(0);
|
||||
assertThat(dictModel.getType()).isEqualTo(type);
|
||||
assertThat(dictModel.getEntries().size()).isEqualTo(3);
|
||||
dictModel.getEntries()
|
||||
.forEach(entry -> assertThat(entry.getTypeId()).isEqualTo(dtType.getTypeId()));
|
||||
.forEach(entry -> assertThat(entry.getTypeId()).isEqualTo("dossierType"));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -38,6 +38,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemp
|
||||
import com.iqser.red.service.redaction.v1.server.annotate.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.server.annotate.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DictionaryCacheService;
|
||||
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
@ -54,6 +55,9 @@ public class RedactionAcceptanceTest extends AbstractRedactionIntegrationTest {
|
||||
@Autowired
|
||||
DictionaryService dictionaryService;
|
||||
|
||||
@Autowired
|
||||
DictionaryCacheService dictionaryCacheService;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void stubClients() {
|
||||
@ -105,7 +109,7 @@ public class RedactionAcceptanceTest extends AbstractRedactionIntegrationTest {
|
||||
String EFSA_SANITISATION_RULES = loadFromClassPath("drools/efsa_sanitisation.drl");
|
||||
when(rulesClient.getRules(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(EFSA_SANITISATION_RULES));
|
||||
dossierDictionary.put(PUBLISHED_INFORMATION_INDICATOR, new ArrayList<>());
|
||||
dictionaryService.clearTenantDictionaryCache();
|
||||
dictionaryCacheService.clearAllCaches();
|
||||
AnalyzeRequest request = uploadFileToStorage("files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf");
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
|
||||
|
||||
@ -0,0 +1,253 @@
|
||||
package com.iqser.red.service.redaction.v1.server.document.graph;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.*;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
||||
|
||||
public class DictionarySearchImplementationsTest {
|
||||
|
||||
private static final int LARGE_TEXT_REPETITIONS = 50_000;
|
||||
private static final int MAX_PII_ENTRY_COUNT = 500_000;
|
||||
private static final String LARGE_TEXT_SAMPLE = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
|
||||
+ "Entity_1 match text. Recommendation_1 also here. Random text continues. ";
|
||||
|
||||
// Dictionary identifiers
|
||||
protected static final String VERTEBRATE_INDICATOR = "vertebrate";
|
||||
protected static final String DICTIONARY_ADDRESS = "CBI_address";
|
||||
protected static final String DICTIONARY_AUTHOR = "CBI_author";
|
||||
protected static final String DICTIONARY_SPONSOR = "CBI_sponsor";
|
||||
protected static final String DICTIONARY_PII = "PII";
|
||||
protected static final String NO_REDACTION_INDICATOR = "no_redaction_indicator";
|
||||
protected static final String REDACTION_INDICATOR = "redaction_indicator";
|
||||
protected static final String HINT_ONLY_INDICATOR = "hint_only";
|
||||
protected static final String MUST_REDACT_INDICATOR = "must_redact";
|
||||
protected static final String PUBLISHED_INFORMATION_INDICATOR = "published_information";
|
||||
protected static final String TEST_METHOD_INDICATOR = "test_method";
|
||||
protected static final String PURITY_INDICATOR = "purity";
|
||||
|
||||
|
||||
@Test
|
||||
public void performanceTestWithRealDictionaries() {
|
||||
|
||||
// Load dictionaries from files
|
||||
Map<String, List<String>> loadedDictionaries = loadDictionaries();
|
||||
|
||||
Map<DictionaryIdentifier, List<String>> dictionaryValues = new HashMap<>();
|
||||
Random random = new Random();
|
||||
|
||||
// Randomly assign EntityType.ENTITY or EntityType.RECOMMENDATION to dictionaries
|
||||
for (Map.Entry<String, List<String>> entry : loadedDictionaries.entrySet()) {
|
||||
String dictionaryName = entry.getKey();
|
||||
List<String> dictionaryTerms = entry.getValue();
|
||||
|
||||
EntityType entityType = random.nextBoolean() ? EntityType.ENTITY : EntityType.RECOMMENDATION;
|
||||
boolean caseSensitive = random.nextBoolean();
|
||||
|
||||
DictionaryIdentifier identifier = new DictionaryIdentifier(dictionaryName, entityType, true, caseSensitive);
|
||||
|
||||
dictionaryValues.put(identifier, dictionaryTerms);
|
||||
}
|
||||
|
||||
// **Added dummy dictionaries as per request**
|
||||
// Case-sensitive dictionary containing "Entity_1"
|
||||
DictionaryIdentifier entity1Identifier = new DictionaryIdentifier("dummy_case_sensitive", EntityType.ENTITY, true, true // Case-sensitive
|
||||
);
|
||||
dictionaryValues.put(entity1Identifier, List.of("Entity_1"));
|
||||
|
||||
// Case-insensitive dictionary containing "recommendation_1"
|
||||
DictionaryIdentifier recommendation1Identifier = new DictionaryIdentifier("dummy_case_insensitive", EntityType.RECOMMENDATION, true, false // Case-insensitive
|
||||
);
|
||||
dictionaryValues.put(recommendation1Identifier, List.of("recommendation_1"));
|
||||
|
||||
// Measure construction time for TrieDictionarySearch
|
||||
long trieDictionaryConstructionStart = System.currentTimeMillis();
|
||||
DoubleTrieDictionarySearch doubleTrieDictionarySearchImpl = new DoubleTrieDictionarySearch(dictionaryValues);
|
||||
long trieDictionaryConstructionDuration = System.currentTimeMillis() - trieDictionaryConstructionStart;
|
||||
|
||||
// Measure construction time for AnotherTrieDictionarySearch
|
||||
long anotherTrieConstructionStart = System.currentTimeMillis();
|
||||
Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap = computeStringIdentifiersMap(dictionaryValues);
|
||||
AhoCorasickMapDictionarySearch ahoCorasickMapDictionarySearchImpl = new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
|
||||
long anotherTrieConstructionDuration = System.currentTimeMillis() - anotherTrieConstructionStart;
|
||||
|
||||
// Measure construction time for SearchImplementations
|
||||
long searchTrieConstructionStart = System.currentTimeMillis();
|
||||
List<SearchImplementation> searchImplementations = dictionaryValues.entrySet()
|
||||
.stream()
|
||||
.map(entry -> new SearchImplementation(entry.getValue(), !entry.getKey().caseSensitive()))
|
||||
.toList();
|
||||
long searchTrieConstructionDuration = System.currentTimeMillis() - searchTrieConstructionStart;
|
||||
|
||||
// Measure construction time for DoubleArrayTrieDictionarySearch
|
||||
long doubleArrayTrieConstructionStart = System.currentTimeMillis();
|
||||
DoubleArrayTrieDictionarySearch doubleArrayTrieSearchImpl = new DoubleArrayTrieDictionarySearch(keyWordToIdentifiersMap);
|
||||
long doubleArrayTrieConstructionDuration = System.currentTimeMillis() - doubleArrayTrieConstructionStart;
|
||||
|
||||
String largeText = LARGE_TEXT_SAMPLE.repeat(LARGE_TEXT_REPETITIONS);
|
||||
|
||||
// Measure search time for TrieDictionarySearch
|
||||
long trieDictionarySearchStart = System.currentTimeMillis();
|
||||
List<DoubleTrieDictionarySearch.MatchTextRange> trieDictionaryMatches = doubleTrieDictionarySearchImpl.getBoundariesAsList(largeText);
|
||||
long trieDictionarySearchDuration = System.currentTimeMillis() - trieDictionarySearchStart;
|
||||
|
||||
// Measure search time for AnotherTrieDictionarySearch
|
||||
long anotherTrieSearchStart = System.currentTimeMillis();
|
||||
List<DictionarySearch.MatchTextRange> anotherTrieMatches = ahoCorasickMapDictionarySearchImpl.getBoundaries(largeText)
|
||||
.toList();
|
||||
long anotherTrieSearchDuration = System.currentTimeMillis() - anotherTrieSearchStart;
|
||||
|
||||
// Measure search time for SearchImplementations
|
||||
long searchImplStart = System.currentTimeMillis();
|
||||
List<TextRange> searchMatches = new ArrayList<>();
|
||||
for (SearchImplementation searchImpl : searchImplementations) {
|
||||
searchMatches.addAll(searchImpl.getBoundaries(largeText));
|
||||
}
|
||||
long searchImplDuration = System.currentTimeMillis() - searchImplStart;
|
||||
|
||||
// Measure search time for DoubleArrayTrieDictionarySearch
|
||||
long doubleArrayTrieSearchStart = System.currentTimeMillis();
|
||||
List<DictionarySearch.MatchTextRange> doubleArrayTrieMatches = doubleArrayTrieSearchImpl.getBoundariesAsList(largeText);
|
||||
long doubleArrayTrieSearchDuration = System.currentTimeMillis() - doubleArrayTrieSearchStart;
|
||||
|
||||
// Output the performance results
|
||||
System.out.println("\nTotal number of keywords is: " + keyWordToIdentifiersMap.size());
|
||||
|
||||
System.out.printf("DoubleTrieDictionarySearch construction took %d ms%n", trieDictionaryConstructionDuration);
|
||||
System.out.printf("DoubleTrieDictionarySearch search took %d ms and found %d matches%n", trieDictionarySearchDuration, trieDictionaryMatches.size());
|
||||
System.out.println();
|
||||
|
||||
System.out.printf("AhoCorasickMapDictionarySearch construction took %d ms%n", anotherTrieConstructionDuration);
|
||||
System.out.printf("AhoCorasickMapDictionarySearch search took %d ms and found %d matches%n", anotherTrieSearchDuration, anotherTrieMatches.size());
|
||||
System.out.println();
|
||||
|
||||
System.out.printf("Multiple Tries construction took %d ms%n", searchTrieConstructionDuration);
|
||||
System.out.printf("Combined SearchImplementations search took %d ms and found %d matches%n", searchImplDuration, searchMatches.size());
|
||||
System.out.println();
|
||||
|
||||
System.out.printf("DoubleArrayTrieDictionarySearch construction took %d ms%n", doubleArrayTrieConstructionDuration);
|
||||
System.out.printf("DoubleArrayTrieDictionarySearch search took %d ms and found %d matches%n", doubleArrayTrieSearchDuration, doubleArrayTrieMatches.size());
|
||||
System.out.println();
|
||||
|
||||
// Assert that all implementations found matches
|
||||
assert !trieDictionaryMatches.isEmpty()
|
||||
&& !anotherTrieMatches.isEmpty()
|
||||
&& !searchMatches.isEmpty()
|
||||
&& !doubleArrayTrieMatches.isEmpty() : "All implementations should find entities.";
|
||||
|
||||
// Ensure all implementations found the same number of matches
|
||||
int expectedMatches = trieDictionaryMatches.size();
|
||||
assertEquals(expectedMatches, anotherTrieMatches.size(), "Mismatch between DoubleTrieDictionarySearch and AhoCorasickMapDictionarySearch");
|
||||
assertEquals(expectedMatches, searchMatches.size(), "Mismatch between DoubleTrieDictionarySearch and Combined SearchImplementations");
|
||||
assertEquals(expectedMatches, doubleArrayTrieMatches.size(), "Mismatch between DoubleTrieDictionarySearch and DoubleArrayTrieDictionarySearch");
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<String>> loadDictionaries() {
|
||||
|
||||
Map<String, List<String>> dictionaries = new HashMap<>();
|
||||
|
||||
dictionaries.put(DICTIONARY_AUTHOR, loadDictionaryFromFile("dictionaries/CBI_author.txt"));
|
||||
dictionaries.put(DICTIONARY_SPONSOR, loadDictionaryFromFile("dictionaries/CBI_sponsor.txt"));
|
||||
dictionaries.put(VERTEBRATE_INDICATOR, loadDictionaryFromFile("dictionaries/vertebrate.txt"));
|
||||
dictionaries.put(DICTIONARY_ADDRESS, loadDictionaryFromFile("dictionaries/CBI_address.txt"));
|
||||
dictionaries.put(NO_REDACTION_INDICATOR, loadDictionaryFromFile("dictionaries/no_redaction_indicator.txt"));
|
||||
dictionaries.put(REDACTION_INDICATOR, loadDictionaryFromFile("dictionaries/redaction_indicator.txt"));
|
||||
dictionaries.put(HINT_ONLY_INDICATOR, loadDictionaryFromFile("dictionaries/hint_only.txt"));
|
||||
dictionaries.put(MUST_REDACT_INDICATOR, loadDictionaryFromFile("dictionaries/must_redact.txt"));
|
||||
dictionaries.put(PUBLISHED_INFORMATION_INDICATOR, loadDictionaryFromFile("dictionaries/published_information.txt"));
|
||||
dictionaries.put(TEST_METHOD_INDICATOR, loadDictionaryFromFile("dictionaries/test_method.txt"));
|
||||
List<String> piis = loadDictionaryFromFile("dictionaries/PII_large.txt");
|
||||
dictionaries.put(DICTIONARY_PII, MAX_PII_ENTRY_COUNT < piis.size() ? piis.subList(0, MAX_PII_ENTRY_COUNT) : piis);
|
||||
dictionaries.put(PURITY_INDICATOR, loadDictionaryFromFile("dictionaries/purity.txt"));
|
||||
|
||||
return dictionaries;
|
||||
}
|
||||
|
||||
|
||||
private List<String> loadDictionaryFromFile(String filePath) {
|
||||
|
||||
List<String> terms = new ArrayList<>();
|
||||
|
||||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(Thread.currentThread()
|
||||
.getContextClassLoader()
|
||||
.getResourceAsStream(filePath))))) {
|
||||
|
||||
terms = reader.lines()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println("Failed to load dictionary from " + filePath + ": " + e.getMessage());
|
||||
}
|
||||
|
||||
return terms;
|
||||
}
|
||||
|
||||
|
||||
private String cleanDictionaryEntry(String entry) {
|
||||
|
||||
return entry.trim();
|
||||
}
|
||||
|
||||
|
||||
private static Map<String, List<DictionaryIdentifierWithKeyword>> computeStringIdentifiersMap(Map<DictionaryIdentifier, List<String>> dictionaryValues) {
|
||||
|
||||
Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap = new HashMap<>();
|
||||
for (Map.Entry<DictionaryIdentifier, List<String>> entry : dictionaryValues.entrySet()) {
|
||||
DictionaryIdentifier identifier = entry.getKey();
|
||||
List<String> values = entry.getValue();
|
||||
for (String value : values) {
|
||||
DictionaryIdentifierWithKeyword idWithKeyword = new DictionaryIdentifierWithKeyword(identifier, value);
|
||||
stringToIdentifiersMap.computeIfAbsent(value.toLowerCase(Locale.ROOT), k -> new ArrayList<>()).add(idWithKeyword);
|
||||
}
|
||||
}
|
||||
return stringToIdentifiersMap;
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMultiplePayloads() {
|
||||
|
||||
DoubleTrieDictionarySearch dictionarySearchImpl = new DoubleTrieDictionarySearch(Map.of(new DictionaryIdentifier("type1", EntityType.ENTITY, false, false),
|
||||
List.of("apple", "banana"),
|
||||
new DictionaryIdentifier("type2", EntityType.RECOMMENDATION, false, false),
|
||||
List.of("apple", "orange"),
|
||||
new DictionaryIdentifier("type3", EntityType.FALSE_POSITIVE, false, false),
|
||||
List.of("apple", "kiwi")));
|
||||
|
||||
List<DoubleTrieDictionarySearch.MatchTextRange> dictionaryMatches = dictionarySearchImpl.getBoundariesAsList(
|
||||
"an apple is delicious, a banana and a kiwi as well. orange is a color.");
|
||||
|
||||
assertEquals(dictionaryMatches.size(), 6);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoubleArrayTrie() {
|
||||
|
||||
Map<String, List<String>> map = new HashMap<>();
|
||||
String[] keyArray = new String[]{"hers", "his", "she", "he"};
|
||||
for (String key : keyArray) {
|
||||
map.put(key, List.of(key, key, key));
|
||||
}
|
||||
|
||||
AhoCorasickDoubleArrayTrie<List<String>> acdat = new AhoCorasickDoubleArrayTrie<>();
|
||||
acdat.build(map);
|
||||
|
||||
final String text = "uhers";
|
||||
List<AhoCorasickDoubleArrayTrie.Hit<List<String>>> wordList = acdat.parseText(text);
|
||||
assertEquals(wordList.size(), 2);
|
||||
}
|
||||
|
||||
}
|
||||
@ -28,9 +28,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSON
|
||||
import com.iqser.red.service.redaction.v1.server.logger.Context;
|
||||
import com.iqser.red.service.redaction.v1.server.model.NerEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionarySearch;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
@ -99,15 +97,11 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
|
||||
Document document = buildGraph(filename);
|
||||
|
||||
dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
Dictionary dictionary = dictionaryService.getDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
|
||||
long dictionarySearchStart = System.currentTimeMillis();
|
||||
List<TextEntity> foundEntities = new LinkedList<>();
|
||||
for (DictionaryModel model : dictionary.getDictionaryModels()) {
|
||||
findEntitiesWithSearchImplementation(document, model.getEntriesSearch(), EntityType.ENTITY, foundEntities, model.getType());
|
||||
findEntitiesWithSearchImplementation(document, model.getFalsePositiveSearch(), EntityType.FALSE_POSITIVE, foundEntities, model.getType());
|
||||
findEntitiesWithSearchImplementation(document, model.getFalseRecommendationsSearch(), EntityType.FALSE_RECOMMENDATION, foundEntities, model.getType());
|
||||
}
|
||||
findEntitiesWithSearchImplementation(document, dictionary.getDictionarySearch(), foundEntities);
|
||||
System.out.printf("Dictionary search took %d ms and found %d entities\n", System.currentTimeMillis() - dictionarySearchStart, foundEntities.size());
|
||||
|
||||
long graphInsertionStart = System.currentTimeMillis();
|
||||
@ -174,7 +168,7 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
|
||||
Document document = buildGraph(filename);
|
||||
|
||||
dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
Dictionary dictionary = dictionaryService.getDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
|
||||
|
||||
int numberOfRuns = 1;
|
||||
float totalSearchTime = 0;
|
||||
@ -190,11 +184,7 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
|
||||
totalGraphTime += graphTime;
|
||||
|
||||
var searchStart = System.currentTimeMillis();
|
||||
for (var model : dictionary.getDictionaryModels()) {
|
||||
findEntitiesWithSearchImplementation(document, model.getEntriesSearch(), EntityType.ENTITY, foundEntities, model.getType());
|
||||
findEntitiesWithSearchImplementation(document, model.getFalsePositiveSearch(), EntityType.FALSE_POSITIVE, foundEntities, model.getType());
|
||||
findEntitiesWithSearchImplementation(document, model.getFalseRecommendationsSearch(), EntityType.FALSE_RECOMMENDATION, foundEntities, model.getType());
|
||||
}
|
||||
findEntitiesWithSearchImplementation(document, dictionary.getDictionarySearch(), foundEntities);
|
||||
var searchTime = System.currentTimeMillis() - searchStart;
|
||||
totalSearchTime += searchTime;
|
||||
|
||||
@ -272,16 +262,12 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
private void findEntitiesWithSearchImplementation(Document document,
|
||||
SearchImplementation searchImplementation,
|
||||
EntityType entityType,
|
||||
List<TextEntity> foundEntities,
|
||||
String type) {
|
||||
private void findEntitiesWithSearchImplementation(Document document, DictionarySearch dictionarySearch, List<TextEntity> foundEntities) {
|
||||
|
||||
TextBlock textBlock = document.getTextBlock();
|
||||
searchImplementation.getBoundaries(textBlock)
|
||||
.filter(boundary -> boundaryIsSurroundedBySeparators(textBlock, boundary))
|
||||
.map(bounds -> TextEntity.initialEntityNode(bounds, type, entityType, document))
|
||||
dictionarySearch.getBoundaries(textBlock)
|
||||
.filter(match -> boundaryIsSurroundedBySeparators(textBlock, match.textRange()))
|
||||
.map(match -> TextEntity.initialEntityNode(match.textRange(), match.identifier().type(), match.identifier().entityType(), document))
|
||||
.forEach(foundEntities::add);
|
||||
}
|
||||
|
||||
|
||||
@ -59,7 +59,6 @@ public class PrecursorEntityTest extends BuildDocumentIntegrationTest {
|
||||
public void stubMethods() {
|
||||
|
||||
MockitoAnnotations.openMocks(this);
|
||||
when(dictionaryService.getColor(DICTIONARY_AUTHOR, TEST_DOSSIER_TEMPLATE_ID)).thenReturn(new float[]{0f, 0f, 0f});
|
||||
when(dictionaryService.isHint(any(), any())).thenReturn(false);
|
||||
}
|
||||
|
||||
|
||||
@ -218,7 +218,7 @@ public class LiveDataIntegrationTest {
|
||||
|
||||
dictionaryService.updateDictionary("dossierTemplateId", "dossierId");
|
||||
|
||||
var dict = dictionaryService.getDeepCopyDictionary("dossierTemplateId", "dossierId");
|
||||
var dict = dictionaryService.getDictionary("dossierTemplateId", "dossierId");
|
||||
assertThat(dict.getLocalAccessMap().size()).isEqualTo(12);
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user