Merge branch 'feature/RED-10290' into 'master'

RED-10290: Improve SearchImplementation logic for dictionaries

Closes RED-10290

See merge request redactmanager/redaction-service!553
This commit is contained in:
Maverick Studer 2024-11-11 12:10:58 +01:00
commit 41f824297c
28 changed files with 123240 additions and 604 deletions

View File

@ -61,7 +61,9 @@ dependencies {
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.ahocorasick:ahocorasick:0.6.3")
implementation("org.ahocorasick:ahocorasick:0.9.0")
implementation("com.hankcs:aho-corasick-double-array-trie:1.2.2")
implementation("com.github.roklenarcic:aho-corasick:1.2")
implementation("org.javassist:javassist:3.29.2-GA")
implementation("org.drools:drools-engine:${droolsVersion}")

View File

@ -28,6 +28,8 @@ public class RedactionServiceSettings {
private boolean priorityMode;
private long firstLevelDictionaryCacheMaximumSize = 1000;
private long dictionaryCacheMaximumSize = 100;
private int dictionaryCacheExpireAfterAccessDays = 3;

View File

@ -0,0 +1,130 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.*;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
/**
 * Skeleton for {@link DictionarySearch} implementations. Subclasses supply the
 * actual pattern matcher via {@link #parseText(CharSequence, HitHandler)}; this
 * class converts raw hits into {@link MatchTextRange} / {@link MatchPosition}
 * results and enforces per-keyword case sensitivity.
 *
 * <p>Matching runs over the lower-cased text provided by {@link TextContext};
 * a hit for a case-sensitive keyword is only accepted when the original
 * (non-lowered) text at the hit position equals the keyword exactly.
 */
public abstract class AbstractDictionarySearch implements DictionarySearch {

    /** Lower-cased keyword -> identifiers of the dictionaries containing it. */
    protected final Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap;

    public AbstractDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
        this.keyWordToIdentifiersMap = keyWordToIdentifiersMap;
    }

    @Override
    public Stream<MatchTextRange> getBoundaries(CharSequence text) {
        return getMatchTextRangeStream(new TextContext(text));
    }

    @Override
    public Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region) {
        // Search only the requested region; the offset passed to TextContext
        // keeps the reported ranges relative to the full text.
        CharSequence subText = text.subSequence(region.start(), region.end());
        return getMatchTextRangeStream(new TextContext(subText, region.start()));
    }

    @Override
    public Stream<MatchTextRange> getBoundaries(TextBlock textBlock) {
        return getBoundaries(textBlock, textBlock.getTextRange());
    }

    @Override
    public Stream<MatchPosition> getMatches(String text) {
        TextContext textContext = new TextContext(text);
        List<MatchPosition> matches = new ArrayList<>();
        parseText(textContext.getLowerText(), (begin, end, value) -> addMatchPositionsForHit(textContext, matches, new Hit(begin, end, value)));
        return matches.stream();
    }

    private Stream<MatchTextRange> getMatchTextRangeStream(TextContext textContext) {
        List<MatchTextRange> matches = new ArrayList<>();
        parseText(textContext.getLowerText(), (begin, end, value) -> addMatchesForHit(textContext, matches, new Hit(begin, end, value)));
        return matches.stream();
    }

    /**
     * Runs the backend matcher over {@code text} and reports every raw hit to
     * {@code handler}.
     */
    protected abstract void parseText(CharSequence text, HitHandler handler);

    protected void addMatchesForHit(TextContext textContext, List<MatchTextRange> matches, Hit hit) {
        int start = textContext.getStart(hit.begin);
        int end = textContext.getEnd(hit.end);
        String matchedText = textContext.getMatchedText(hit.begin, hit.end);
        for (DictionaryIdentifierWithKeyword idkw : hit.value) {
            if (acceptsHit(idkw, matchedText)) {
                matches.add(new MatchTextRange(idkw.identifier(), new TextRange(start, end)));
            }
        }
    }

    protected void addMatchPositionsForHit(TextContext textContext, List<MatchPosition> matches, Hit hit) {
        int start = textContext.getStart(hit.begin);
        int end = textContext.getEnd(hit.end);
        String matchedText = textContext.getMatchedText(hit.begin, hit.end);
        for (DictionaryIdentifierWithKeyword idkw : hit.value) {
            if (acceptsHit(idkw, matchedText)) {
                matches.add(new MatchPosition(idkw.identifier(), start, end));
            }
        }
    }

    /**
     * Shared acceptance rule (previously duplicated in both hit handlers):
     * a hit counts unless the keyword is case-sensitive and the original text
     * differs from the keyword.
     */
    private static boolean acceptsHit(DictionaryIdentifierWithKeyword idkw, String matchedText) {
        return !idkw.identifier().caseSensitive() || matchedText.equals(idkw.keyword());
    }

    /** Callback invoked by {@link #parseText} for every raw matcher hit. */
    protected interface HitHandler {

        void handle(int begin, int end, List<DictionaryIdentifierWithKeyword> value);
    }

    /**
     * A raw matcher hit: begin/end offsets as reported by the backend matcher
     * on the lower-cased text, plus the matched keyword's payload.
     */
    protected static class Hit {

        final int begin;
        final int end;
        final List<DictionaryIdentifierWithKeyword> value;

        Hit(int begin, int end, List<DictionaryIdentifierWithKeyword> value) {
            this.begin = begin;
            this.end = end;
            this.value = value;
        }
    }
}

View File

@ -0,0 +1,32 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.List;
import java.util.Map;
import com.roklenarcic.util.strings.AhoCorasickMap;
import com.roklenarcic.util.strings.MapMatchListener;
import com.roklenarcic.util.strings.StringMap;
/**
 * {@link DictionarySearch} backed by Rok Lenarcic's Aho-Corasick string map
 * (com.roklenarcic). Every occurrence reported by {@link StringMap#match} is
 * forwarded to the generic hit handling in {@link AbstractDictionarySearch}.
 */
public class AhoCorasickMapDictionarySearch extends AbstractDictionarySearch {

    private final StringMap<List<DictionaryIdentifierWithKeyword>> map;

    public AhoCorasickMapDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
        super(keyWordToIdentifiersMap);
        // Third argument toggles the library's case handling; keys are stored
        // lower-cased upstream — NOTE(review): confirm the flag's exact semantics.
        this.map = new AhoCorasickMap<>(keyWordToIdentifiersMap.keySet(), keyWordToIdentifiersMap.values(), false);
    }

    @Override
    protected void parseText(CharSequence text, HitHandler handler) {
        map.match(text.toString(), (haystack, startPosition, endPosition, value) -> {
            handler.handle(startPosition, endPosition, value);
            // Returning true keeps the matcher running so every hit is reported.
            return true;
        });
    }
}

View File

@ -2,19 +2,14 @@ package com.iqser.red.service.redaction.v1.server.model.dictionary;
import static java.lang.String.format;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.utils.Patterns;
@ -29,31 +24,70 @@ import lombok.Getter;
@Data
public class Dictionary {
@Getter
private List<DictionaryModel> dictionaryModels;
// todo: dossier and dossier template level DictionaryModels override each other
// at the moment there are no problems because they always have the same rank / hint information
// but it should be changed so that the localAccessMap contains all models
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
private final Map<String, Map<Level, DictionaryModel>> localAccessMap = new HashMap<>();
@Getter
private DictionaryVersion version;
private final DictionaryVersion version;
private final DictionarySearch dictionarySearch;
/**
 * Origin of a {@link DictionaryModel}: defined on the dossier template or on
 * the individual dossier (see {@code getLevel(boolean)}).
 */
public enum Level {
DOSSIER_TEMPLATE,
DOSSIER
}
public Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version) {
Dictionary(List<DictionaryModel> dictionaryModels, DictionaryVersion version, DictionarySearch dictionarySearch) {
this.dictionaryModels = dictionaryModels;
this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), Map.of(getLevel(dm.isDossierDictionary()), dm)));
this.version = version;
this.dictionarySearch = dictionarySearch;
}
/** Maps the dossier-dictionary flag onto the corresponding {@link Level}. */
private Level getLevel(boolean isDossierDictionary) {
    if (isDossierDictionary) {
        return Level.DOSSIER;
    }
    return Level.DOSSIER_TEMPLATE;
}
/**
 * Determines the default level for a given type based on the levels present.
 * If both levels are present, it defaults to {@code Level.DOSSIER}.
 *
 * @param type The type to determine the default level for.
 * @return The default {@link Level} for the specified type.
 * @throws NotFoundException If the type is not found in the dictionary.
 */
private Level getDefaultLevel(String type) {
    Map<Level, DictionaryModel> byLevel = localAccessMap.get(type);
    if (byLevel == null || byLevel.isEmpty()) {
        throw new NotFoundException("Type: " + type + " is not found");
    }
    // Prefer the dossier level; otherwise fall back to whichever level exists.
    return byLevel.containsKey(Level.DOSSIER)
            ? Level.DOSSIER
            : byLevel.keySet().iterator().next();
}
/**
 * Returns the rank of the dictionary of the given type at the given level,
 * or 0 when the type or level is unknown.
 */
public int getDictionaryRank(String type, Level level) {
    Map<Level, DictionaryModel> byLevel = localAccessMap.get(type);
    if (byLevel == null) {
        return 0;
    }
    DictionaryModel model = byLevel.get(level);
    return model == null ? 0 : model.getRank();
}
public int getDictionaryRank(String type) {
if (!localAccessMap.containsKey(type)) {
return 0;
}
return localAccessMap.get(type).getRank();
return getDictionaryRank(type, getDefaultLevel(type));
}
@ -64,11 +98,21 @@ public class Dictionary {
*/
public boolean hasLocalEntries() {
return dictionaryModels.stream()
return getDictionaryModels().stream()
.anyMatch(dm -> !dm.getLocalEntriesWithMatchedRules().isEmpty());
}
/** Returns an unmodifiable list of all models across every type and level. */
public List<DictionaryModel> getDictionaryModels() {
    List<DictionaryModel> models = new ArrayList<>();
    localAccessMap.values().forEach(byLevel -> models.addAll(byLevel.values()));
    return List.copyOf(models);
}
public Set<String> getTypes() {
return localAccessMap.keySet();
@ -76,56 +120,144 @@ public class Dictionary {
/**
* Retrieves the {@link DictionaryModel} of a specified type.
* Retrieves the {@link DictionaryModel} of a specified type and level.
*
* @param type The type of dictionary model to retrieve.
* @return The {@link DictionaryModel} of the specified type.
* @throws NotFoundException If the specified type is not found in the dictionary.
* @param type The type of dictionary model to retrieve.
* @param level The level of the dictionary model to retrieve.
* @return The {@link DictionaryModel} of the specified type and level.
* @throws NotFoundException If the specified type or level is not found in the dictionary.
*/
public DictionaryModel getType(String type) {
public DictionaryModel getType(String type, Level level) {
DictionaryModel model = localAccessMap.get(type);
if (model == null) {
throw new NotFoundException("Type: " + type + " is not found");
Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
if (levelMap == null || !levelMap.containsKey(level)) {
throw new NotFoundException("Type: " + type + " with level: " + level + " is not found");
}
return model;
return levelMap.get(level);
}
/**
 * Retrieves the {@link DictionaryModel} of a specified type at the default level.
 *
 * @param type The type of dictionary model to retrieve.
 * @return The {@link DictionaryModel} of the specified type at the default level.
 * @throws NotFoundException If the specified type is not found in the dictionary.
 */
public DictionaryModel getType(String type) {
    Level defaultLevel = getDefaultLevel(type);
    return getType(type, defaultLevel);
}
/**
 * Checks if the dictionary of a specific type and level is considered a hint.
 *
 * @param type  The type of dictionary to check.
 * @param level The level of the dictionary to check.
 * @return true if the dictionary model is marked as a hint, false otherwise
 *         (including when the type itself is unknown).
 */
public boolean isHint(String type, Level level) {
    // Guard against unknown types: the sibling getDictionaryRank(type, level)
    // tolerates them, so this method must not throw an NPE either.
    Map<Level, DictionaryModel> byLevel = localAccessMap.get(type);
    if (byLevel == null) {
        return false;
    }
    DictionaryModel model = byLevel.get(level);
    return model != null && model.isHint();
}
/**
* Checks if the dictionary of a specific type is considered a hint at the default level.
*
* @param type The type of dictionary to check.
* @return true if the dictionary model is marked as a hint, false otherwise.
*/
public boolean isHint(String type) {
DictionaryModel model = localAccessMap.get(type);
if (model != null) {
return model.isHint();
}
return false;
return isHint(type, getDefaultLevel(type));
}
/**
 * Checks if the dictionary of a specific type and level is case-insensitive.
 *
 * @param type  The type of dictionary to check.
 * @param level The level of the dictionary to check.
 * @return true if the dictionary is case-insensitive, false otherwise
 *         (including when the type itself is unknown).
 */
public boolean isCaseInsensitiveDictionary(String type, Level level) {
    // Guard against unknown types to stay consistent with
    // getDictionaryRank(type, level) instead of throwing an NPE.
    Map<Level, DictionaryModel> byLevel = localAccessMap.get(type);
    if (byLevel == null) {
        return false;
    }
    DictionaryModel dictionaryModel = byLevel.get(level);
    return dictionaryModel != null && dictionaryModel.isCaseInsensitive();
}
/**
* Checks if the dictionary of a specific type is case-insensitive at the default level.
*
* @param type The type of dictionary to check.
* @return true if the dictionary is case-insensitive, false otherwise.
*/
public boolean isCaseInsensitiveDictionary(String type) {
DictionaryModel dictionaryModel = localAccessMap.get(type);
if (dictionaryModel != null) {
return dictionaryModel.isCaseInsensitive();
}
return false;
return isCaseInsensitiveDictionary(type, getDefaultLevel(type));
}
/**
 * Adds a local dictionary entry of a specific type and level.
 *
 * @param type            The type of dictionary to add the entry to.
 * @param value           The value of the entry.
 * @param matchedRules    A collection of {@link MatchedRule} associated with the entry.
 * @param alsoAddLastname Indicates whether to also add the lastname separately as an entry.
 * @param level           The level of the dictionary where the entry should be added.
 * @throws IllegalArgumentException If the specified type/level does not exist within the
 *                                  dictionary, or if the type does not have any local
 *                                  entries defined.
 */
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname, Level level) {
    // Silently ignore blank values. This also covers the empty string, which
    // made the former StringUtils.isEmpty() check below unreachable — removed.
    if (value.isBlank()) {
        return;
    }
    Map<Level, DictionaryModel> levelMap = localAccessMap.get(type);
    if (levelMap == null || !levelMap.containsKey(level)) {
        throw new IllegalArgumentException(format("DictionaryModel of type %s with level %s does not exist", type, level));
    }
    DictionaryModel dictionaryModel = levelMap.get(level);
    if (dictionaryModel.getLocalEntriesWithMatchedRules() == null) {
        throw new IllegalArgumentException(format("DictionaryModel of type %s has no local Entries", type));
    }
    Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
    String cleanedValue = dictionaryModel.isCaseInsensitive() ? value.toLowerCase(Locale.US) : value;
    mergeLocalEntry(dictionaryModel, cleanedValue.trim(), matchedRulesSet);
    if (alsoAddLastname) {
        // NOTE(review): this takes the FIRST whitespace-separated token of the
        // (untrimmed) value as the "lastname" — confirm the expected name order.
        mergeLocalEntry(dictionaryModel, cleanedValue.split(" ")[0], matchedRulesSet);
    }
}

/** Merges {@code matchedRules} into the local entry for {@code key}, unioning with any existing rules. */
private static void mergeLocalEntry(DictionaryModel dictionaryModel, String key, Set<MatchedRule> matchedRules) {
    dictionaryModel.getLocalEntriesWithMatchedRules()
            .merge(key,
                    matchedRules,
                    (existing, added) -> Stream.concat(existing.stream(), added.stream())
                            .collect(Collectors.toSet()));
}
/**
* Adds a local dictionary entry of a specific type at the default level.
*
* @param type The type of dictionary to add the entry to.
* @param value The value of the entry.
@ -138,40 +270,7 @@ public class Dictionary {
*/
private void addLocalDictionaryEntry(String type, String value, Collection<MatchedRule> matchedRules, boolean alsoAddLastname) {
if (value.isBlank()) {
return;
}
if (localAccessMap.get(type) == null) {
throw new IllegalArgumentException(format("DictionaryModel of type %s does not exist", type));
}
if (localAccessMap.get(type).getLocalEntriesWithMatchedRules() == null) {
throw new IllegalArgumentException(format("DictionaryModel of type %s has no local Entries", type));
}
if (StringUtils.isEmpty(value)) {
throw new IllegalArgumentException(format("%s is not a valid dictionary entry", value));
}
boolean isCaseInsensitive = localAccessMap.get(type).isCaseInsensitive();
Set<MatchedRule> matchedRulesSet = new HashSet<>(matchedRules);
String cleanedValue = value;
if (isCaseInsensitive) {
cleanedValue = cleanedValue.toLowerCase(Locale.US);
}
localAccessMap.get(type)
.getLocalEntriesWithMatchedRules()
.merge(cleanedValue.trim(),
matchedRulesSet,
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
.collect(Collectors.toSet()));
if (alsoAddLastname) {
String lastname = cleanedValue.split(" ")[0];
localAccessMap.get(type)
.getLocalEntriesWithMatchedRules()
.merge(lastname,
matchedRulesSet,
(set1, set2) -> Stream.concat(set1.stream(), set2.stream())
.collect(Collectors.toSet()));
}
addLocalDictionaryEntry(type, value, matchedRules, alsoAddLastname, getDefaultLevel(type));
}
@ -179,10 +278,22 @@ public class Dictionary {
* Recommends a text entity for inclusion in every dictionary model without separating the last name.
*
* @param textEntity The {@link TextEntity} to be recommended.
* @param level The level of the dictionary where the recommendation should be added.
*/
public void recommendEverywhere(TextEntity textEntity, Level level) {
    // 'false' -> the last name is NOT additionally registered as its own entry.
    addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), false, level);
}
/**
* Recommends a text entity for inclusion in every dictionary model without separating the last name at the default level.
*
* @param textEntity The {@link TextEntity} to be recommended.
*/
public void recommendEverywhere(TextEntity textEntity) {
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), false);
recommendEverywhere(textEntity, getDefaultLevel(textEntity.type()));
}
@ -190,10 +301,22 @@ public class Dictionary {
* Recommends a text entity for inclusion in every dictionary model with the last name added separately.
*
* @param textEntity The {@link TextEntity} to be recommended.
* @param level The level of the dictionary where the recommendation should be added.
*/
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity, Level level) {
    // 'true' -> the last name is additionally registered as its own entry.
    addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), true, level);
}
/**
* Recommends a text entity for inclusion in every dictionary model with the last name added separately at the default level.
*
* @param textEntity The {@link TextEntity} to be recommended.
*/
public void recommendEverywhereWithLastNameSeparately(TextEntity textEntity) {
addLocalDictionaryEntry(textEntity.type(), textEntity.getValue(), textEntity.getMatchedRuleList(), true);
recommendEverywhereWithLastNameSeparately(textEntity, getDefaultLevel(textEntity.type()));
}
@ -201,11 +324,22 @@ public class Dictionary {
* Adds multiple author names contained within a text entity as recommendations in the dictionary.
*
* @param textEntity The {@link TextEntity} containing author names to be added.
* @param level The level of the dictionary where the recommendations should be added.
*/
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity, Level level) {
    // Each author extracted from the entity becomes its own recommendation,
    // with the last name additionally registered separately.
    splitIntoAuthorNames(textEntity)
            .forEach(author -> addLocalDictionaryEntry(textEntity.type(), author, textEntity.getMatchedRuleList(), true, level));
}
/**
* Adds multiple author names contained within a text entity as recommendations in the dictionary at the default level.
*
* @param textEntity The {@link TextEntity} containing author names to be added.
*/
public void addMultipleAuthorsAsRecommendation(TextEntity textEntity) {
splitIntoAuthorNames(textEntity).forEach(authorName -> addLocalDictionaryEntry(textEntity.type(), authorName, textEntity.getMatchedRuleList(), true));
addMultipleAuthorsAsRecommendation(textEntity, getDefaultLevel(textEntity.type()));
}

View File

@ -0,0 +1,90 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@Service
@RequiredArgsConstructor
public class DictionaryFactory {

    /**
     * Builds a {@link Dictionary} together with the search structure that
     * covers every keyword of the given models.
     */
    @SneakyThrows
    public Dictionary create(List<DictionaryModel> dictionaryModels, DictionaryVersion dictionaryVersion) {
        Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap = computeStringIdentifiersMap(dictionaryModels);
        return new Dictionary(dictionaryModels, dictionaryVersion, getDictionarySearch(keyWordToIdentifiersMap));
    }

    private static DictionarySearch getDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
        // A more sophisticated selection of the dictionarySearch could be done here,
        // but as there is no need to fine-tune at the moment we use the all-rounder
        // solution, the AhoCorasickMapDictionarySearch, based on
        // https://github.com/RokLenarcic/AhoCorasick
        //
        // Outline of how a more complex decision could look:
        // if (!redactionServiceSettings.isPriorityMode() && keyWordToIdentifiersMap.keySet().size() < 50_000) {
        //     dictionarySearch = new DoubleArrayTrieDictionarySearch(keyWordToIdentifiersMap);
        // } else {
        //     dictionarySearch = new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
        // }
        return new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
    }

    /**
     * Maps every keyword (key is the lower-cased value) to the identifiers of
     * all dictionaries that contain it — across entries, false positives and
     * false recommendations; dossier dictionaries additionally index their
     * deleted entries under {@link EntityType#DICTIONARY_REMOVAL}.
     */
    protected static Map<String, List<DictionaryIdentifierWithKeyword>> computeStringIdentifiersMap(List<DictionaryModel> dictionaryModels) {
        Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap = new HashMap<>();
        dictionaryModels.forEach(model -> {
            addEntriesToMap(stringToIdentifiersMap, model, model.isHint() ? EntityType.HINT : EntityType.ENTITY, model.getEntries(), false);
            addEntriesToMap(stringToIdentifiersMap, model, EntityType.FALSE_POSITIVE, model.getFalsePositives(), false);
            addEntriesToMap(stringToIdentifiersMap, model, EntityType.FALSE_RECOMMENDATION, model.getFalseRecommendations(), false);
            if (model.isDossierDictionary()) {
                addEntriesToMap(stringToIdentifiersMap, model, EntityType.DICTIONARY_REMOVAL, model.getEntries(), true);
            }
        });
        return stringToIdentifiersMap;
    }

    private static void addEntriesToMap(Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap,
            DictionaryModel model,
            EntityType entityType,
            Set<DictionaryEntryModel> entries,
            boolean isDeleted) {
        DictionaryIdentifier identifier = new DictionaryIdentifier(model.getType(), entityType, model.isDossierDictionary(), !model.isCaseInsensitive());
        entries.stream()
                .filter(entry -> entry.isDeleted() == isDeleted)
                .map(DictionaryEntry::getValue)
                .forEach(value -> stringToIdentifiersMap
                        .computeIfAbsent(value.toLowerCase(Locale.ROOT), key -> new ArrayList<>())
                        .add(new DictionaryIdentifierWithKeyword(identifier, value)));
    }
}

View File

@ -0,0 +1,8 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
/**
 * Identifies one logical dictionary: its type name, the {@link EntityType} its
 * matches should produce, whether it stems from a dossier-level dictionary,
 * and whether its keywords must match case-sensitively.
 */
public record DictionaryIdentifier(String type, EntityType entityType, boolean dossierDictionaryEntry, boolean caseSensitive) {
}

View File

@ -0,0 +1,51 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import org.ahocorasick.trie.PayloadEmit;
import org.ahocorasick.trie.PayloadTrie;
import java.util.Collection;
/**
 * Thin wrapper around an org.ahocorasick {@link PayloadTrie} whose payload is
 * a {@link DictionaryIdentifier}. Instances are immutable; construct them via
 * {@link DictionaryIdentifierTrieBuilder}.
 */
public final class DictionaryIdentifierTrie {

    private final PayloadTrie<DictionaryIdentifier> trie;

    private DictionaryIdentifierTrie(PayloadTrie<DictionaryIdentifier> trie) {
        this.trie = trie;
    }

    /** Fluent builder delegating to {@link PayloadTrie.PayloadTrieBuilder}. */
    public static class DictionaryIdentifierTrieBuilder {

        private final PayloadTrie.PayloadTrieBuilder<DictionaryIdentifier> builder = PayloadTrie.builder();

        public DictionaryIdentifierTrieBuilder ignoreCase() {
            builder.ignoreCase();
            return this;
        }

        public DictionaryIdentifierTrieBuilder addKeyword(String keyword, DictionaryIdentifier payload) {
            builder.addKeyword(keyword, payload);
            return this;
        }

        /** Registers every keyword in {@code keywords} with the same payload. */
        public DictionaryIdentifierTrieBuilder addKeywords(Collection<String> keywords, DictionaryIdentifier payload) {
            keywords.forEach(keyword -> builder.addKeyword(keyword, payload));
            return this;
        }

        public DictionaryIdentifierTrie build() {
            return new DictionaryIdentifierTrie(builder.build());
        }
    }

    /** Returns all emits (keyword occurrences with payloads) found in {@code text}. */
    public Collection<PayloadEmit<DictionaryIdentifier>> parseText(CharSequence text) {
        return trie.parseText(text);
    }

    /** Returns true when at least one keyword occurs in {@code text}. */
    public boolean containsMatch(CharSequence text) {
        return trie.containsMatch(text);
    }
}

View File

@ -0,0 +1,5 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
/**
 * Pairs a {@link DictionaryIdentifier} with one of its keywords in the
 * keyword's original casing (lookup-map keys are stored lower-cased elsewhere).
 */
public record DictionaryIdentifierWithKeyword(DictionaryIdentifier identifier, String keyword) {
}

View File

@ -1,13 +1,12 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.type.Type;
import com.iqser.red.service.redaction.v1.server.model.document.entity.MatchedRule;
import lombok.Data;
@ -21,7 +20,7 @@ import lombok.extern.slf4j.Slf4j;
*/
@Data
@Slf4j
public class DictionaryModel implements Serializable {
public class DictionaryModel implements Cloneable {
private final String type;
private final int rank;
@ -33,13 +32,8 @@ public class DictionaryModel implements Serializable {
private final Set<DictionaryEntryModel> falsePositives;
private final Set<DictionaryEntryModel> falseRecommendations;
private transient SearchImplementation entriesSearch;
private transient SearchImplementation deletionEntriesSearch;
private transient SearchImplementation falsePositiveSearch;
private transient SearchImplementation falseRecommendationsSearch;
private final HashMap<String, Set<MatchedRule>> localEntriesWithMatchedRules = new HashMap<>();
private transient SearchImplementation localSearch;
private SearchImplementation localSearch;
/**
@ -91,74 +85,6 @@ public class DictionaryModel implements Serializable {
}
/**
* Returns the search implementation for non-deleted dictionary entries.
*
* @return The {@link SearchImplementation} for non-deleted dictionary entries.
*/
public SearchImplementation getEntriesSearch() {
if (entriesSearch == null) {
this.entriesSearch = new SearchImplementation(this.entries.stream()
.filter(e -> !e.isDeleted())
.map(DictionaryEntry::getValue)
.collect(Collectors.toList()), caseInsensitive);
}
return entriesSearch;
}
/**
* Returns the search implementation for deleted dictionary entries.
*
* @return The {@link SearchImplementation} for deleted dictionary entries.
*/
public SearchImplementation getDeletionEntriesSearch() {
if (deletionEntriesSearch == null) {
this.deletionEntriesSearch = new SearchImplementation(this.entries.stream()
.filter(DictionaryEntry::isDeleted)
.map(DictionaryEntry::getValue)
.collect(Collectors.toList()), caseInsensitive);
}
return deletionEntriesSearch;
}
/**
* Returns the search implementation for non-deleted false positive entries.
*
* @return The {@link SearchImplementation} for non-deleted false positive entries.
*/
public SearchImplementation getFalsePositiveSearch() {
if (falsePositiveSearch == null) {
this.falsePositiveSearch = new SearchImplementation(this.falsePositives.stream()
.filter(e -> !e.isDeleted())
.map(DictionaryEntry::getValue)
.collect(Collectors.toList()), caseInsensitive);
}
return falsePositiveSearch;
}
/**
* Returns the search implementation for non-deleted false recommendation entries.
*
* @return The {@link SearchImplementation} for non-deleted false recommendation entries.
*/
public SearchImplementation getFalseRecommendationsSearch() {
if (falseRecommendationsSearch == null) {
this.falseRecommendationsSearch = new SearchImplementation(this.falseRecommendations.stream()
.filter(e -> !e.isDeleted())
.map(DictionaryEntry::getValue)
.collect(Collectors.toList()), caseInsensitive);
}
return falseRecommendationsSearch;
}
/**
* Retrieves the matched rules for a given value from the local dictionary entries.
* The value is processed based on the case sensitivity of the dictionary.
@ -172,4 +98,149 @@ public class DictionaryModel implements Serializable {
return localEntriesWithMatchedRules.get(cleanedValue);
}
@Override
public DictionaryModel clone() {
    try {
        // Shallow copy via Object.clone(); the cached localSearch is cleared on
        // the clone (presumably rebuilt on demand — confirm with callers).
        // NOTE(review): localEntriesWithMatchedRules is a final HashMap and is
        // therefore SHARED between original and clone — verify that no caller
        // mutates local entries on a cloned model expecting isolation.
        DictionaryModel cloned = (DictionaryModel) super.clone();
        cloned.localSearch = null;
        return cloned;
    } catch (CloneNotSupportedException e) {
        // Unreachable while the class implements Cloneable.
        throw new AssertionError("Cloning not supported", e);
    }
}
/**
 * Collects, into {@code newValues}, every entry, false positive and false
 * recommendation whose version is strictly greater than {@code versionThreshold}.
 *
 * @param versionThreshold Entries with a version above this value are considered new.
 * @param newValues        Target set the new increment values are added to.
 */
public void addNewEntries(long versionThreshold, Set<DictionaryIncrementValue> newValues) {
    collectNewValues(getEntries(), versionThreshold, newValues);
    collectNewValues(getFalsePositives(), versionThreshold, newValues);
    collectNewValues(getFalseRecommendations(), versionThreshold, newValues);
}

/** Adds an increment value for each entry newer than the threshold (shared by all three entry kinds). */
private void collectNewValues(Set<DictionaryEntryModel> entries, long versionThreshold, Set<DictionaryIncrementValue> newValues) {
    for (DictionaryEntryModel entry : entries) {
        if (entry.getVersion() > versionThreshold) {
            newValues.add(new DictionaryIncrementValue(entry.getValue(), isCaseInsensitive()));
        }
    }
}
/**
 * Carries over old entries, false positives and false recommendations that are
 * not superseded by {@code newEntries}, adding them to the respective combined sets.
 *
 * <p>When the case sensitivity of the old model and the new type differ, values
 * are compared on the lower-cased side so duplicates are detected across the
 * two conventions; when they agree, values are compared verbatim.
 *
 * @param newType                     The replacing type, used for its case sensitivity.
 * @param newEntries                  The replacing entries.
 * @param combinedEntries             Target set for surviving old entries.
 * @param combinedFalsePositives      Target set for surviving old false positives.
 * @param combinedFalseRecommendations Target set for surviving old false recommendations.
 */
public void handleOldEntries(Type newType,
        DictionaryEntries newEntries,
        Set<DictionaryEntryModel> combinedEntries,
        Set<DictionaryEntryModel> combinedFalsePositives,
        Set<DictionaryEntryModel> combinedFalseRecommendations) {
    boolean oldCaseInsensitive = isCaseInsensitive();
    boolean newCaseInsensitive = newType.isCaseInsensitive();
    // Lower-case the new values whenever the sensitivities differ; additionally
    // lower-case the old values only when the old model is case-sensitive and
    // the new one is not (old case-insensitive values are already lower-cased).
    boolean lowercaseNewValues = oldCaseInsensitive != newCaseInsensitive;
    boolean lowercaseOldValues = !oldCaseInsensitive && newCaseInsensitive;
    addMissingOldEntries(getEntries(), newEntries.getEntries(), combinedEntries, lowercaseNewValues, lowercaseOldValues);
    addMissingOldEntries(getFalsePositives(), newEntries.getFalsePositives(), combinedFalsePositives, lowercaseNewValues, lowercaseOldValues);
    addMissingOldEntries(getFalseRecommendations(), newEntries.getFalseRecommendations(), combinedFalseRecommendations, lowercaseNewValues, lowercaseOldValues);
}

/**
 * Adds to {@code target} every old entry whose (optionally lower-cased) value
 * does not occur among the (optionally lower-cased) new values.
 */
private static void addMissingOldEntries(Set<DictionaryEntryModel> oldEntries,
        Set<DictionaryEntryModel> newEntries,
        Set<DictionaryEntryModel> target,
        boolean lowercaseNewValues,
        boolean lowercaseOldValues) {
    // Compute the new values once, as in the original per-branch code.
    Set<String> newValues = newEntries.stream()
            .map(entry -> normalize(entry.getValue(), lowercaseNewValues))
            .collect(Collectors.toSet());
    oldEntries.stream()
            .filter(entry -> !newValues.contains(normalize(entry.getValue(), lowercaseOldValues)))
            .forEach(target::add);
}

/** Lower-cases {@code value} with {@link Locale#ROOT} when requested, else returns it unchanged. */
private static String normalize(String value, boolean lowercase) {
    return lowercase ? value.toLowerCase(Locale.ROOT) : value;
}
}

View File

@ -0,0 +1,86 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.List;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
/**
 * Common interface for dictionary search implementations.
 *
 * <p>Implementations locate occurrences of dictionary keywords inside a piece of
 * text and report each hit together with the {@link DictionaryIdentifier} of the
 * dictionary entry that produced it. Two result shapes are offered: character
 * ranges ({@link MatchTextRange}) and raw start/end indices ({@link MatchPosition}).
 */
public interface DictionarySearch {

    /**
     * Retrieves a list of match boundaries within the given text.
     *
     * <p>Convenience wrapper around {@link #getBoundaries(CharSequence)}; the
     * returned list is unmodifiable (see {@code Stream.toList()}).
     *
     * @param text The text to search within.
     * @return A list of MatchTextRange representing the boundaries of matches.
     */
    default List<MatchTextRange> getBoundariesAsList(CharSequence text) {
        return getBoundaries(text).toList();
    }

    /**
     * Retrieves a stream of match boundaries within the given text.
     *
     * @param text The text to search within.
     * @return A stream of MatchTextRange representing the boundaries of matches.
     */
    Stream<MatchTextRange> getBoundaries(CharSequence text);

    /**
     * Retrieves a list of match boundaries within a specified region of the text.
     *
     * <p>Convenience wrapper around {@link #getBoundaries(CharSequence, TextRange)};
     * the returned list is unmodifiable (see {@code Stream.toList()}).
     *
     * @param text The text to search within.
     * @param region The specific region of the text to search.
     * @return A list of MatchTextRange representing the boundaries of matches.
     */
    default List<MatchTextRange> getBoundariesAsList(CharSequence text, TextRange region) {
        return getBoundaries(text, region).toList();
    }

    /**
     * Retrieves a stream of match boundaries within a specified region of the text.
     *
     * <p>NOTE(review): reported ranges are expected to be absolute positions in
     * {@code text}, not positions relative to {@code region} — confirm against
     * implementations.
     *
     * @param text The text to search within.
     * @param region The specific region of the text to search.
     * @return A stream of MatchTextRange representing the boundaries of matches.
     */
    Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region);

    /**
     * Retrieves a stream of match boundaries within the given TextBlock.
     *
     * @param textBlock The TextBlock to search within.
     * @return A stream of MatchTextRange representing the boundaries of matches.
     */
    Stream<MatchTextRange> getBoundaries(TextBlock textBlock);

    /**
     * Retrieves a list of match positions within the given text.
     *
     * <p>Convenience wrapper around {@link #getMatches(String)}; the returned
     * list is unmodifiable (see {@code Stream.toList()}).
     *
     * @param text The text to search within.
     * @return A list of MatchPosition representing the positions of matches.
     */
    default List<MatchPosition> getMatchesAsList(String text) {
        return getMatches(text).toList();
    }

    /**
     * Retrieves a stream of match positions within the given text.
     *
     * @param text The text to search within.
     * @return A stream of MatchPosition representing the positions of matches.
     */
    Stream<MatchPosition> getMatches(String text);

    /**
     * Record representing the range of matched text along with its identifier.
     *
     * @param identifier the dictionary entry that matched
     * @param textRange  the character range covered by the match
     */
    record MatchTextRange(DictionaryIdentifier identifier, TextRange textRange) {}

    /**
     * Record representing the start and end positions of a match along with its identifier.
     *
     * @param identifier the dictionary entry that matched
     * @param startIndex start index of the match
     * @param endIndex   end index of the match
     */
    record MatchPosition(DictionaryIdentifier identifier, int startIndex, int endIndex) {}
}

View File

@ -0,0 +1,31 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
/**
 * {@link AbstractDictionarySearch} backed by hankcs' double-array-trie
 * Aho-Corasick implementation.
 */
public class DoubleArrayTrieDictionarySearch extends AbstractDictionarySearch {

    /** Immutable automaton built once from all keywords at construction time. */
    private final AhoCorasickDoubleArrayTrie<List<DictionaryIdentifierWithKeyword>> trie;

    /**
     * Builds the double-array trie from the given keyword map.
     *
     * @param keyWordToIdentifiersMap keyword -> identifiers that own that keyword
     */
    public DoubleArrayTrieDictionarySearch(Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap) {
        super(keyWordToIdentifiersMap);
        trie = new AhoCorasickDoubleArrayTrie<>();
        // AhoCorasickDoubleArrayTrie.build() assumes keys arrive in lexicographic
        // order (the library's own examples feed it a TreeMap); an unsorted map
        // (e.g. HashMap) can yield a corrupt automaton. Sort defensively unless
        // the caller already supplies a sorted map.
        Map<String, List<DictionaryIdentifierWithKeyword>> sortedKeywords =
                keyWordToIdentifiersMap instanceof SortedMap
                        ? keyWordToIdentifiersMap
                        : new TreeMap<>(keyWordToIdentifiersMap);
        trie.build(sortedKeywords);
    }

    /**
     * Scans {@code text} and forwards every hit (begin index, end index and the
     * identifiers attached to the matched keyword) to the handler.
     */
    @Override
    protected void parseText(CharSequence text, HitHandler handler) {
        List<AhoCorasickDoubleArrayTrie.Hit<List<DictionaryIdentifierWithKeyword>>> hits = trie.parseText(text);
        for (AhoCorasickDoubleArrayTrie.Hit<List<DictionaryIdentifierWithKeyword>> hit : hits) {
            handler.handle(hit.begin, hit.end, hit.value);
        }
    }
}

View File

@ -0,0 +1,138 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
public class DoubleTrieDictionarySearch implements DictionarySearch {
private final Map<DictionaryIdentifier, List<String>> caseSensitiveEntries = new HashMap<>();
private final Map<DictionaryIdentifier, List<String>> caseInsensitiveEntries = new HashMap<>();
private final DictionaryIdentifierTrie caseSensitiveTrie;
private final DictionaryIdentifierTrie caseInsensitiveTrie;
public DoubleTrieDictionarySearch(Map<DictionaryIdentifier, List<String>> dictionaryValues) {
for (Map.Entry<DictionaryIdentifier, List<String>> entry : dictionaryValues.entrySet()) {
DictionaryIdentifier identifier = entry.getKey();
List<String> values = entry.getValue();
if (identifier.caseSensitive()) {
caseSensitiveEntries.put(identifier, values);
} else {
caseInsensitiveEntries.put(identifier, values);
}
}
this.caseSensitiveTrie = createTrie(caseSensitiveEntries, false);
this.caseInsensitiveTrie = createTrie(caseInsensitiveEntries, true);
}
private DictionaryIdentifierTrie createTrie(Map<DictionaryIdentifier, List<String>> entries, boolean ignoreCase) {
if (entries.isEmpty()) {
return null;
}
DictionaryIdentifierTrie.DictionaryIdentifierTrieBuilder builder = new DictionaryIdentifierTrie.DictionaryIdentifierTrieBuilder();
if (ignoreCase) {
builder.ignoreCase();
}
entries.forEach((identifier, values) -> {
for (String value : values) {
builder.addKeyword(value, identifier);
}
});
return builder.build();
}
public boolean atLeastOneMatches(String text) {
if (!caseSensitiveEntries.isEmpty() && caseSensitiveTrie != null && caseSensitiveTrie.containsMatch(text)) {
return true;
}
return !caseInsensitiveEntries.isEmpty() && caseInsensitiveTrie != null && caseInsensitiveTrie.containsMatch(text);
}
@Override
public Stream<MatchTextRange> getBoundaries(CharSequence text) {
List<MatchTextRange> matches = new ArrayList<>();
addMatchTextRangesForTrie(caseSensitiveEntries, caseSensitiveTrie, matches, text);
addMatchTextRangesForTrie(caseInsensitiveEntries, caseInsensitiveTrie, matches, text);
return matches.stream();
}
@Override
public Stream<MatchTextRange> getBoundaries(TextBlock textBlock) {
return getBoundaries(textBlock, textBlock.getTextRange());
}
@Override
public Stream<MatchTextRange> getBoundaries(CharSequence text, TextRange region) {
List<MatchTextRange> matches = new ArrayList<>();
addMatchTextRangesForTrie(text, region, matches, caseSensitiveEntries, caseSensitiveTrie);
addMatchTextRangesForTrie(text, region, matches, caseInsensitiveEntries, caseInsensitiveTrie);
return matches.stream();
}
@Override
public Stream<MatchPosition> getMatches(String text) {
List<MatchPosition> matches = new ArrayList<>();
addMatchPositionsForTrie(caseSensitiveEntries, caseSensitiveTrie, matches, text);
addMatchPositionsForTrie(caseInsensitiveEntries, caseInsensitiveTrie, matches, text);
return matches.stream();
}
private void addMatchTextRangesForTrie(Map<DictionaryIdentifier, List<String>> entries, DictionaryIdentifierTrie trie, List<MatchTextRange> matches, CharSequence text) {
if (!entries.isEmpty() && trie != null) {
matches.addAll(trie.parseText(text)
.stream()
.map(r -> new MatchTextRange(r.getPayload(), new TextRange(r.getStart(), r.getEnd() + 1)))
.toList());
}
}
private void addMatchTextRangesForTrie(CharSequence text,
TextRange region,
List<MatchTextRange> matches,
Map<DictionaryIdentifier, List<String>> entries,
DictionaryIdentifierTrie trie) {
if (!entries.isEmpty() && trie != null) {
CharSequence subSequence = text.subSequence(region.start(), region.end());
matches.addAll(trie.parseText(subSequence)
.stream()
.map(r -> new MatchTextRange(r.getPayload(), new TextRange(r.getStart() + region.start(), r.getEnd() + region.start() + 1)))
.toList());
}
}
private void addMatchPositionsForTrie(Map<DictionaryIdentifier, List<String>> entries, DictionaryIdentifierTrie trie, List<MatchPosition> matches, String text) {
if (!entries.isEmpty() && trie != null) {
matches.addAll(trie.parseText(text)
.stream()
.map(r -> new MatchPosition(r.getPayload(), r.getStart(), r.getEnd() + 1))
.toList());
}
}
}

View File

@ -0,0 +1,46 @@
package com.iqser.red.service.redaction.v1.server.model.dictionary;
import java.util.Locale;
import lombok.Getter;
/**
 * Immutable view of a piece of text being searched, together with the offset
 * needed to translate hit indices back to absolute document positions.
 */
public class TextContext {

    // Original (case-preserving) text the hit indices refer to.
    private final CharSequence text;

    // Lower-cased copy, computed eagerly for case-insensitive matching.
    private final String lowerText;

    // Shift applied when translating relative hit indices to absolute positions.
    private final int offset;

    TextContext(CharSequence text, int offset) {
        this.text = text;
        this.lowerText = text.toString().toLowerCase(Locale.ROOT);
        this.offset = offset;
    }

    TextContext(CharSequence text) {
        this(text, 0);
    }

    /** Lower-cased view of the text (locale-independent via {@code Locale.ROOT}). */
    public String getLowerText() {
        return lowerText;
    }

    /** Absolute start position of a hit beginning at {@code hitBegin}. */
    public int getStart(int hitBegin) {
        return offset + hitBegin;
    }

    /** Absolute end position of a hit ending at {@code hitEnd}. */
    public int getEnd(int hitEnd) {
        return offset + hitEnd;
    }

    /** Matched substring, taken from the original (non-lowered) text; indices are relative. */
    public String getMatchedText(int hitBegin, int hitEnd) {
        return text.subSequence(hitBegin, hitEnd).toString();
    }
}

View File

@ -8,7 +8,6 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.function.Supplier;
import java.util.stream.Collectors;
@ -104,7 +103,7 @@ public class AnalysisPreparationService {
CompletableFuture.allOf(kieWrapperEntityRulesFuture, kieWrapperComponentRulesFuture, documentFuture, importedRedactionsFuture, nerEntitiesFuture).join();
Dictionary dictionary = getDictionary(analyzeRequest);
Dictionary dictionary = dictionaryService.getDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
Document document = documentFuture.get();
ImportedRedactions importedRedactions = importedRedactionsFuture.get();
@ -195,7 +194,7 @@ public class AnalysisPreparationService {
taskExecutor);
CompletableFuture<DictionaryAndNotFoundEntries> dictionaryAndNotFoundEntriesCompletableFuture = CompletableFuture.supplyAsync(() -> {
Dictionary dictionary = getDictionary(analyzeRequest);
Dictionary dictionary = dictionaryService.getDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
NotFoundEntries notFoundEntries = getNotFoundEntries(analyzeRequest, reanalysisSetupData.document(), reanalysisInitialProcessingData.importedRedactions());
return new DictionaryAndNotFoundEntries(dictionary, notFoundEntries.notFoundManualRedactionEntries(), notFoundEntries.notFoundImportedEntries());
}, taskExecutor);
@ -253,15 +252,6 @@ public class AnalysisPreparationService {
}
private Dictionary getDictionary(AnalyzeRequest analyzeRequest) {
dictionaryService.updateDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(analyzeRequest.getDossierTemplateId(), analyzeRequest.getDossierId());
log.info("Updated Dictionaries for file {} in dossier {}", analyzeRequest.getFileId(), analyzeRequest.getDossierId());
return dictionary;
}
private NotFoundEntries getNotFoundEntries(AnalyzeRequest analyzeRequest, Document document, ImportedRedactions importedRedactions) {
var notFoundManualRedactionEntries = manualRedactionEntryService.addManualRedactionEntriesAndReturnNotFoundEntries(analyzeRequest,

View File

@ -0,0 +1,107 @@
package com.iqser.red.service.redaction.v1.server.service;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import org.springframework.stereotype.Service;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryRepresentation;
import com.iqser.red.service.redaction.v1.server.model.dictionary.TenantDictionary;
import com.knecon.fforesight.tenantcommons.TenantContext;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class DictionaryCacheService {

    /**
     * Per-tenant container holding the raw {@link DictionaryRepresentation}s,
     * keyed by tenant id and lazily created on first access.
     */
    private final LoadingCache<String, TenantDictionary> tenantDictionaryCache;

    /** Fully-built {@link Dictionary} objects, keyed by (tenantId, dossierId). */
    private final Cache<DictionaryCacheKey, Dictionary> dictionaryCache;

    public DictionaryCacheService(RedactionServiceSettings settings) {
        int expiryDays = settings.getDictionaryCacheExpireAfterAccessDays();
        this.tenantDictionaryCache = CacheBuilder.newBuilder()
                .maximumSize(settings.getDictionaryCacheMaximumSize())
                .expireAfterAccess(expiryDays, TimeUnit.DAYS)
                .build(new CacheLoader<>() {

                    @Override
                    public TenantDictionary load(String key) {
                        return new TenantDictionary();
                    }
                });
        this.dictionaryCache = CacheBuilder.newBuilder()
                .maximumSize(settings.getFirstLevelDictionaryCacheMaximumSize())
                .expireAfterAccess(expiryDays, TimeUnit.DAYS)
                .build();
    }

    /** Drops all cached data for every tenant. */
    public void clearAllCaches() {
        tenantDictionaryCache.invalidateAll();
        dictionaryCache.invalidateAll();
    }

    /** Cached representation for a dossier template, or null when none is cached. */
    public DictionaryRepresentation getDossierTemplateDictionary(String dossierTemplateId) {
        return currentTenantDictionary().getDictionariesByDossierTemplate().get(dossierTemplateId);
    }

    /** Cached representation for a dossier, or null when none is cached. */
    public DictionaryRepresentation getDossierDictionary(String dossierId) {
        return currentTenantDictionary().getDictionariesByDossier().get(dossierId);
    }

    /** Stores (or replaces) the representation for a dossier template. */
    public void addDictionaryRepresentationForDossierTemplate(String dossierTemplateId, DictionaryRepresentation dictionaryRepresentation) {
        currentTenantDictionary().getDictionariesByDossierTemplate().put(dossierTemplateId, dictionaryRepresentation);
    }

    /** Stores (or replaces) the representation for a dossier. */
    public void addDictionaryRepresentationForDossier(String dossierId, DictionaryRepresentation dictionaryRepresentation) {
        currentTenantDictionary().getDictionariesByDossier().put(dossierId, dictionaryRepresentation);
    }

    /** Looks up a fully-built dictionary for the given tenant/dossier pair. */
    public Optional<Dictionary> getDictionary(String tenantId, String dossierId) {
        return Optional.ofNullable(dictionaryCache.getIfPresent(new DictionaryCacheKey(tenantId, dossierId)));
    }

    /** Caches a fully-built dictionary under the given tenant/dossier pair. */
    public void putDictionary(String tenantId, String dossierId, Dictionary newDictionary) {
        dictionaryCache.put(new DictionaryCacheKey(tenantId, dossierId), newDictionary);
    }

    // Loads the container for the current tenant; @SneakyThrows covers the
    // checked ExecutionException declared by LoadingCache.get (the loader
    // above cannot actually fail).
    @SneakyThrows
    private TenantDictionary currentTenantDictionary() {
        return tenantDictionaryCache.get(TenantContext.getTenantId());
    }

    /** Composite cache key: a dictionary is scoped to one tenant and one dossier. */
    public record DictionaryCacheKey(String tenantId, String dossierId) {
    }
}

View File

@ -38,40 +38,21 @@ public class DictionarySearchService {
@Observed(name = "DictionarySearchService", contextualName = "add-dictionary-entries")
public void addDictionaryEntities(Dictionary dictionary, SemanticNode node) {
dictionary.getDictionaryModels()
.forEach(model -> {
bySearchImplementationAsDictionary(model.getEntriesSearch(),
model.getType(),
model.isHint() ? EntityType.HINT : EntityType.ENTITY,
node,
model.isDossierDictionary());
bySearchImplementationAsDictionary(model.getFalsePositiveSearch(), model.getType(), EntityType.FALSE_POSITIVE, node, model.isDossierDictionary());
bySearchImplementationAsDictionary(model.getFalseRecommendationsSearch(), model.getType(), EntityType.FALSE_RECOMMENDATION, node, model.isDossierDictionary());
if (model.isDossierDictionary()) {
bySearchImplementationAsDictionary(model.getDeletionEntriesSearch(), model.getType(), EntityType.DICTIONARY_REMOVAL, node, model.isDossierDictionary());
}
EntityCreationService entityCreationService = new EntityCreationService(entityEnrichmentService);
dictionary.getDictionarySearch().getBoundaries(node.getTextBlock())
.filter(boundary -> entityCreationService.isValidEntityTextRange(node.getTextBlock(), boundary.textRange()))
.forEach(match -> {
Set<Engine> engines = match.identifier().dossierDictionaryEntry() ? Set.of(Engine.DOSSIER_DICTIONARY) : Set.of(Engine.DICTIONARY);
entityCreationService.byTextRangeWithEngine(match.textRange(), match.identifier().type(), match.identifier().entityType(), node, engines)
.ifPresent(entity -> {
entity.setDictionaryEntry(true);
entity.setDossierDictionaryEntry(match.identifier().dossierDictionaryEntry());
if (match.identifier().entityType().equals(EntityType.DICTIONARY_REMOVAL)) {
entity.ignore("DICT.0.0", "Ignore Dossier Dictionary Entity with DICTIONARY_REMOVAL entity type");
}
});
});
}
public void bySearchImplementationAsDictionary(SearchImplementation searchImplementation,
String type,
EntityType entityType,
SemanticNode node,
boolean isDossierDictionaryEntry) {
Set<Engine> engines = isDossierDictionaryEntry ? Set.of(Engine.DOSSIER_DICTIONARY) : Set.of(Engine.DICTIONARY);
EntityCreationService entityCreationService = new EntityCreationService(entityEnrichmentService);
searchImplementation.getBoundaries(node.getTextBlock())
.filter(boundary -> entityCreationService.isValidEntityTextRange(node.getTextBlock(), boundary))
.forEach(bounds -> entityCreationService.byTextRangeWithEngine(bounds, type, entityType, node, engines)
.ifPresent(entity -> {
entity.setDictionaryEntry(true);
entity.setDossierDictionaryEntry(isDossierDictionaryEntry);
if (entityType.equals(EntityType.DICTIONARY_REMOVAL)) {
entity.ignore("DICT.0.0", "Ignore Dossier Dictionary Entity with DICTIONARY_REMOVAL entity type");
}
}));
}
}

View File

@ -1,7 +1,6 @@
package com.iqser.red.service.redaction.v1.server.service;
import java.awt.Color;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
@ -9,38 +8,27 @@ import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.iqser.red.service.dictionarymerge.commons.CommonsDictionaryModel;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntry;
import com.iqser.red.service.dictionarymerge.commons.DictionaryEntryModel;
import com.iqser.red.service.dictionarymerge.commons.DictionaryMergeService;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.type.Type;
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryEntries;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryFactory;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryRepresentation;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.model.dictionary.TenantDictionary;
import com.knecon.fforesight.tenantcommons.TenantContext;
import feign.FeignException;
import io.micrometer.core.annotation.Timed;
import io.micrometer.observation.annotation.Observed;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -50,34 +38,40 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DictionaryService {
public static final String DEFAULT_COLOR = "#cccccc";
private final DictionaryClient dictionaryClient;
private final RedactionServiceSettings settings;
private final DictionaryMergeService dictionaryMergeService;
private LoadingCache<String, TenantDictionary> tenantDictionaryCache;
private final DictionaryCacheService dictionaryCacheService;
private final DictionaryFactory dictionaryFactory;
@PostConstruct
protected void createCache() {
@SneakyThrows
@Observed(name = "DictionaryService", contextualName = "get-dictionary")
@Timed("redactmanager_getDictionary")
public Dictionary getDictionary(String dossierTemplateId, String dossierId) {
tenantDictionaryCache = CacheBuilder.newBuilder()
.maximumSize(settings.getDictionaryCacheMaximumSize())
.expireAfterAccess(settings.getDictionaryCacheExpireAfterAccessDays(), TimeUnit.DAYS)
.build(new CacheLoader<>() {
public TenantDictionary load(String key) {
String tenantId = TenantContext.getTenantId();
return new TenantDictionary();
}
});
}
Optional<Dictionary> cachedDictionary = dictionaryCacheService.getDictionary(tenantId, dossierId);
if (cachedDictionary.isPresent()) {
log.debug("Dictionary found in cache");
boolean isUpToDate = checkIfDictionaryIsUpToDate(dossierTemplateId, dossierId, cachedDictionary.get());
if (isUpToDate) {
log.info("Returning cached Dictionary for tenantId: {}, dossierId: {}", tenantId, dossierId);
return cachedDictionary.get();
} else {
log.debug("Cached Dictionary is outdated for tenantId: {}, dossierId: {}", tenantId, dossierId);
}
} else {
log.info("No cached Dictionary found for tenantId: {}, dossierId: {}", tenantId, dossierId);
}
public void clearTenantDictionaryCache() {
DictionaryVersion latestVersion = updateDictionary(dossierTemplateId, dossierId);
Dictionary newDictionary = buildDictionary(dossierTemplateId, dossierId, latestVersion);
tenantDictionaryCache.invalidateAll();
dictionaryCacheService.putDictionary(tenantId, dossierId, newDictionary);
log.debug("Cached new Dictionary for tenantId: {}, dossierId: {}", tenantId, dossierId);
return newDictionary;
}
@ -87,19 +81,24 @@ public class DictionaryService {
public DictionaryVersion updateDictionary(String dossierTemplateId, String dossierId) {
log.debug("Updating dictionary data for dossierTemplate {} and dossier {}", dossierTemplateId, dossierId);
long dossierTemplateDictionaryVersion = dictionaryClient.getVersion(dossierTemplateId);
var dossierTemplateDictionary = getDossierTemplateDictionary(dossierTemplateId);
if (dossierTemplateDictionary == null || dossierTemplateDictionaryVersion > dossierTemplateDictionary.getDictionaryVersion()) {
updateDictionaryEntry(dossierTemplateId, dossierTemplateDictionaryVersion, getVersion(dossierTemplateDictionary), null);
// Update template dictionary
long latestTemplateVersion = dictionaryClient.getVersion(dossierTemplateId);
DictionaryRepresentation templateDictRep = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId);
if (templateDictRep == null || latestTemplateVersion > templateDictRep.getDictionaryVersion()) {
updateDictionaryEntry(dossierTemplateId, latestTemplateVersion, templateDictRep != null ? templateDictRep.getDictionaryVersion() : null, null);
}
long dossierDictionaryVersion = dictionaryClient.getVersionForDossier(dossierId);
var dossierDictionary = getDossierDictionary(dossierId);
if (dossierDictionary == null || dossierDictionaryVersion > dossierDictionary.getDictionaryVersion()) {
updateDictionaryEntry(dossierTemplateId, dossierDictionaryVersion, getVersion(dossierDictionary), dossierId);
// Update dossier dictionary
long latestDossierVersion = dictionaryClient.getVersionForDossier(dossierId);
DictionaryRepresentation dossierDictRep = dictionaryCacheService.getDossierDictionary(dossierId);
if (dossierDictRep == null || latestDossierVersion > dossierDictRep.getDictionaryVersion()) {
updateDictionaryEntry(dossierTemplateId, latestDossierVersion, dossierDictRep != null ? dossierDictRep.getDictionaryVersion() : null, dossierId);
}
return DictionaryVersion.builder().dossierTemplateVersion(dossierTemplateDictionaryVersion).dossierVersion(dossierDictionaryVersion).build();
return DictionaryVersion.builder().dossierTemplateVersion(latestTemplateVersion).dossierVersion(latestDossierVersion).build();
}
@ -108,57 +107,21 @@ public class DictionaryService {
@Timed("redactmanager_getDictionaryIncrements")
public DictionaryIncrement getDictionaryIncrements(String dossierTemplateId, DictionaryVersion fromVersion, String dossierId) {
DictionaryVersion version = updateDictionary(dossierTemplateId, dossierId);
DictionaryVersion latestVersion = updateDictionary(dossierTemplateId, dossierId);
Set<DictionaryIncrementValue> newValues = new HashSet<>();
List<DictionaryModel> dictionaryModels = getDossierTemplateDictionary(dossierTemplateId).getDictionary();
Set<DictionaryIncrementValue> newValues = Collections.synchronizedSet(new HashSet<>());
List<DictionaryModel> templateDictionaries = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId).getDictionary();
dictionaryModels.forEach(dictionaryModel -> {
dictionaryModel.getEntries()
.forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
dictionaryModel.getFalsePositives()
.forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
dictionaryModel.getFalseRecommendations()
.forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierTemplateVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
});
templateDictionaries.parallelStream()
.forEach(dictionaryModel -> dictionaryModel.addNewEntries(fromVersion.getDossierTemplateVersion(), newValues));
if (dossierDictionaryExists(dossierId)) {
dictionaryModels = getDossierDictionary(dossierId).getDictionary();
dictionaryModels.forEach(dictionaryModel -> {
dictionaryModel.getEntries()
.forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
dictionaryModel.getFalsePositives()
.forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
dictionaryModel.getFalseRecommendations()
.forEach(dictionaryEntry -> {
if (dictionaryEntry.getVersion() > fromVersion.getDossierVersion()) {
newValues.add(new DictionaryIncrementValue(dictionaryEntry.getValue(), dictionaryModel.isCaseInsensitive()));
}
});
});
List<DictionaryModel> dossierDictionaries = dictionaryCacheService.getDossierDictionary(dossierId).getDictionary();
dossierDictionaries.parallelStream()
.forEach(dictionaryModel -> dictionaryModel.addNewEntries(fromVersion.getDossierVersion(), newValues));
}
return new DictionaryIncrement(newValues, version);
return new DictionaryIncrement(newValues, latestVersion);
}
@ -168,342 +131,164 @@ public class DictionaryService {
try {
DictionaryRepresentation dictionaryRepresentation = new DictionaryRepresentation();
var typeResponse = dossierId == null ? dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId, currentVersion, true) : dictionaryClient.getAllTypesForDossier(
dossierId,
currentVersion,
true);
List<Type> typeResponse;
if (dossierId == null) {
typeResponse = dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId, currentVersion, true);
} else {
typeResponse = dictionaryClient.getAllTypesForDossier(dossierId, currentVersion, true);
}
if (CollectionUtils.isNotEmpty(typeResponse)) {
String tenantId = TenantContext.getTenantId();
List<DictionaryModel> dictionary = typeResponse.stream()
.parallel()
.map(t -> {
TenantContext.setTenantId(tenantId);
Optional<DictionaryModel> optionalOldModel;
if (dossierId == null) {
var representation = getDossierTemplateDictionary(dossierTemplateId);
optionalOldModel = representation != null ? representation.getDictionary()
.stream()
.filter(f -> f.getType().equals(t.getType()))
.findAny() : Optional.empty();
} else {
var representation = getDossierDictionary(dossierId);
optionalOldModel = representation != null ? representation.getDictionary()
.stream()
.filter(f -> f.getType().equals(t.getType()))
.findAny() : Optional.empty();
}
Set<DictionaryEntryModel> entries = new HashSet<>();
Set<DictionaryEntryModel> falsePositives = new HashSet<>();
Set<DictionaryEntryModel> falseRecommendations = new HashSet<>();
DictionaryEntries newEntries = mapEntries(t);
var newValues = newEntries.getEntries()
.stream()
.map(DictionaryEntry::getValue)
.collect(Collectors.toSet());
var newFalsePositivesValues = newEntries.getFalsePositives()
.stream()
.map(DictionaryEntry::getValue)
.collect(Collectors.toSet());
var newFalseRecommendationsValues = newEntries.getFalseRecommendations()
.stream()
.map(DictionaryEntry::getValue)
.collect(Collectors.toSet());
optionalOldModel.ifPresent(oldDictionaryModel -> {
});
if (optionalOldModel.isPresent()) {
var oldModel = optionalOldModel.get();
if (oldModel.isCaseInsensitive() && !t.isCaseInsensitive()) {
// add old entries from existing DictionaryModel but exclude lower case representation
entries.addAll(oldModel.getEntries()
.stream()
.filter(f -> !newValues.stream()
.map(s -> s.toLowerCase(Locale.ROOT))
.toList().contains(f.getValue()))
.toList());
falsePositives.addAll(oldModel.getFalsePositives()
.stream()
.filter(f -> !newFalsePositivesValues.stream()
.map(s -> s.toLowerCase(Locale.ROOT))
.toList().contains(f.getValue()))
.toList());
falseRecommendations.addAll(oldModel.getFalseRecommendations()
.stream()
.filter(f -> !newFalseRecommendationsValues.stream()
.map(s -> s.toLowerCase(Locale.ROOT))
.toList().contains(f.getValue()))
.toList());
} else if (!oldModel.isCaseInsensitive() && t.isCaseInsensitive()) {
// add old entries from existing DictionaryModel but exclude upper case representation
entries.addAll(oldModel.getEntries()
.stream()
.filter(f -> !newValues.contains(f.getValue().toLowerCase(Locale.ROOT)))
.toList());
falsePositives.addAll(oldModel.getFalsePositives()
.stream()
.filter(f -> !newFalsePositivesValues.contains(f.getValue().toLowerCase(Locale.ROOT)))
.toList());
falseRecommendations.addAll(oldModel.getFalseRecommendations()
.stream()
.filter(f -> !newFalseRecommendationsValues.contains(f.getValue().toLowerCase(Locale.ROOT)))
.toList());
} else {
// add old entries from existing DictionaryModel
entries.addAll(oldModel.getEntries()
.stream()
.filter(f -> !newValues.contains(f.getValue()))
.toList());
falsePositives.addAll(oldModel.getFalsePositives()
.stream()
.filter(f -> !newFalsePositivesValues.contains(f.getValue()))
.toList());
falseRecommendations.addAll(oldModel.getFalseRecommendations()
.stream()
.filter(f -> !newFalseRecommendationsValues.contains(f.getValue()))
.toList());
}
}
// Add Increments
entries.addAll(newEntries.getEntries());
falsePositives.addAll(newEntries.getFalsePositives());
falseRecommendations.addAll(newEntries.getFalseRecommendations());
return new DictionaryModel(t.getType(),
t.getRank(),
convertColor(t.getHexColor()),
t.isCaseInsensitive(),
t.isHint(),
entries,
falsePositives,
falseRecommendations,
dossierId != null);
})
.map(t -> mapTypeToDictionaryModel(tenantId, t, dossierTemplateId, dossierId))
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
.collect(Collectors.toList());
dictionary.forEach(dm -> dictionaryRepresentation.getLocalAccessMap().put(dm.getType(), dm));
Colors colors = dictionaryClient.getColors(dossierTemplateId);
dictionaryRepresentation.setDefaultColor(convertColor(DEFAULT_COLOR));
dictionaryRepresentation.setRequestAddColor(convertColor(colors.getRequestAddColor()));
dictionaryRepresentation.setRequestRemoveColor(convertColor(colors.getRequestRemoveColor()));
dictionaryRepresentation.setNotRedactedColor(convertColor(colors.getSkippedColor()));
dictionaryRepresentation.setDossierTemplateId(dossierTemplateId);
dictionaryRepresentation.setDictionaryVersion(version);
dictionaryRepresentation.setDictionary(dictionary);
if (dossierId == null) {
addDictionaryRepresentationForDossierTemplate(dossierTemplateId, dictionaryRepresentation);
dictionaryCacheService.addDictionaryRepresentationForDossierTemplate(dossierTemplateId, dictionaryRepresentation);
} else {
addDictionaryRepresentationForDossier(dossierId, dictionaryRepresentation);
dictionaryCacheService.addDictionaryRepresentationForDossier(dossierId, dictionaryRepresentation);
}
}
} catch (FeignException e) {
log.warn("Got some unknown feignException", e);
log.warn("Got some unknown FeignException", e);
throw e;
}
}
private DictionaryModel mapTypeToDictionaryModel(String tenantId, Type type, String dossierTemplateId, String dossierId) {
TenantContext.setTenantId(tenantId);
Optional<DictionaryModel> optionalOldModel = getExistingDictionaryModel(type.getType(), dossierTemplateId, dossierId);
DictionaryEntries newEntries = mapEntries(type);
Set<DictionaryEntryModel> combinedEntries = new HashSet<>(newEntries.getEntries());
Set<DictionaryEntryModel> combinedFalsePositives = new HashSet<>(newEntries.getFalsePositives());
Set<DictionaryEntryModel> combinedFalseRecommendations = new HashSet<>(newEntries.getFalseRecommendations());
optionalOldModel.ifPresent(oldModel -> oldModel.handleOldEntries(type, newEntries, combinedEntries, combinedFalsePositives, combinedFalseRecommendations));
combinedEntries.addAll(newEntries.getEntries());
combinedFalsePositives.addAll(newEntries.getFalsePositives());
combinedFalseRecommendations.addAll(newEntries.getFalseRecommendations());
return new DictionaryModel(type.getType(),
type.getRank(),
null,
type.isCaseInsensitive(),
type.isHint(),
combinedEntries,
combinedFalsePositives,
combinedFalseRecommendations,
dossierId != null);
}
private Optional<DictionaryModel> getExistingDictionaryModel(String type, String dossierTemplateId, String dossierId) {
DictionaryRepresentation representation = dossierId == null ? //
dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId) : dictionaryCacheService.getDossierDictionary(dossierId);
return representation != null ? representation.getDictionary()
.stream()
.filter(f -> f.getType().equals(type))
.findAny() : Optional.empty();
}
private DictionaryEntries mapEntries(Type type) {
Set<DictionaryEntryModel> entries = type.getEntries() != null ? new HashSet<>(type.getEntries()
.stream()
.map(DictionaryEntryModel::new)
.collect(Collectors.toSet())) : new HashSet<>();
Set<DictionaryEntryModel> falsePositives = type.getFalsePositiveEntries() != null ? new HashSet<>(type.getFalsePositiveEntries()
.stream()
.map(DictionaryEntryModel::new)
.collect(Collectors.toSet())) : new HashSet<>();
Set<DictionaryEntryModel> falseRecommendations = type.getFalseRecommendationEntries() != null ? new HashSet<>(type.getFalseRecommendationEntries()
.stream()
.map(DictionaryEntryModel::new)
.collect(Collectors.toSet())) : new HashSet<>();
Set<DictionaryEntryModel> entries = type.getEntries() != null ? type.getEntries()
.stream()
.map(DictionaryEntryModel::new)
.collect(Collectors.toSet()) : new HashSet<>();
Set<DictionaryEntryModel> falsePositives = type.getFalsePositiveEntries() != null ? type.getFalsePositiveEntries()
.stream()
.map(DictionaryEntryModel::new)
.collect(Collectors.toSet()) : new HashSet<>();
Set<DictionaryEntryModel> falseRecommendations = type.getFalseRecommendationEntries() != null ? type.getFalseRecommendationEntries()
.stream()
.map(DictionaryEntryModel::new)
.collect(Collectors.toSet()) : new HashSet<>();
if (type.isCaseInsensitive()) {
entries.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
falsePositives.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
falseRecommendations.forEach(entry -> entry.setValue(entry.getValue().toLowerCase(Locale.ROOT)));
}
log.debug("Dictionary update returned {} entries {} falsePositives and {} falseRecommendations for type {}",
log.debug("Dictionary update returned {} entries, {} falsePositives, and {} falseRecommendations for type {}",
entries.size(),
falsePositives.size(),
falseRecommendations.size(),
entries);
type.getType());
return new DictionaryEntries(entries, falsePositives, falseRecommendations);
}
private float[] convertColor(String hex) {
Color color = Color.decode(hex);
return new float[]{color.getRed() / 255f, color.getGreen() / 255f, color.getBlue() / 255f};
}
@SneakyThrows
public float[] getColor(String type, String dossierTemplateId) {
DictionaryModel model = getDossierTemplateDictionary(dossierTemplateId).getLocalAccessMap()
.get(type);
if (model != null) {
return model.getColor();
}
return getDossierTemplateDictionary(dossierTemplateId).getDefaultColor();
}
@SneakyThrows
public boolean isHint(String type, String dossierTemplateId) {
DictionaryModel model = getDossierTemplateDictionary(dossierTemplateId).getLocalAccessMap()
DictionaryModel model = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId).getLocalAccessMap()
.get(type);
if (model != null) {
return model.isHint();
}
return false;
return model != null && model.isHint();
}
@SneakyThrows
@Timed("redactmanager_getDeepCopyDictionary")
@Observed(name = "DictionaryService", contextualName = "deep-copy-dictionary")
public Dictionary getDeepCopyDictionary(String dossierTemplateId, String dossierId) {
private Dictionary buildDictionary(String dossierTemplateId, String dossierId, DictionaryVersion dictionaryVersion) {
List<DictionaryModel> mergedDictionaries = new LinkedList<>();
DictionaryRepresentation dossierTemplateRepresentation = getDossierTemplateDictionary(dossierTemplateId);
List<DictionaryModel> dossierTemplateDictionaries = dossierTemplateRepresentation.getDictionary();
dossierTemplateDictionaries.forEach(dm -> mergedDictionaries.add(SerializationUtils.clone(dm)));
// add dossier
long dossierDictionaryVersion = -1;
if (dossierDictionaryExists(dossierId)) {
DictionaryRepresentation dossierRepresentation = getDossierDictionary(dossierId);
List<DictionaryModel> dossierDictionaries = dossierRepresentation.getDictionary();
dossierDictionaries.forEach(dm -> mergedDictionaries.add(SerializationUtils.clone(dm)));
return getDictionary(mergedDictionaries, dossierTemplateRepresentation, dossierRepresentation.getDictionaryVersion());
} else {
return getDictionary(mergedDictionaries, dossierTemplateRepresentation, dossierDictionaryVersion);
DictionaryRepresentation templateDictRep = dictionaryCacheService.getDossierTemplateDictionary(dossierTemplateId);
if (templateDictRep != null) {
templateDictRep.getDictionary()
.forEach(dm -> mergedDictionaries.add(dm.clone()));
}
if (dossierDictionaryExists(dossierId)) {
DictionaryRepresentation dossierDictRep = dictionaryCacheService.getDossierDictionary(dossierId);
if (dossierDictRep != null) {
dossierDictRep.getDictionary()
.forEach(dm -> mergedDictionaries.add(dm.clone()));
}
}
return dictionaryFactory.create(mergedDictionaries.stream()
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
.collect(Collectors.toList()), dictionaryVersion);
}
private Dictionary getDictionary(List<DictionaryModel> mergedDictionaries, DictionaryRepresentation dossierTemplateRepresentation, long dossierDictionaryVersion) {
private boolean checkIfDictionaryIsUpToDate(String dossierTemplateId, String dossierId, Dictionary cachedDictionary) {
return new Dictionary(mergedDictionaries.stream()
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
.collect(Collectors.toList()),
DictionaryVersion.builder()
.dossierTemplateVersion(dossierTemplateRepresentation.getDictionaryVersion())
.dossierVersion(dossierDictionaryVersion)
.build());
}
long latestTemplateVersion = dictionaryClient.getVersion(dossierTemplateId);
long latestDossierVersion = dictionaryClient.getVersionForDossier(dossierId);
DictionaryVersion cachedVersion = cachedDictionary.getVersion();
@SneakyThrows
public float[] getNotRedactedColor(String dossierTemplateId) {
return getDossierTemplateDictionary(dossierTemplateId).getNotRedactedColor();
}
@SneakyThrows
private void addDictionaryRepresentationForDossierTemplate(String dossierTemplateId, DictionaryRepresentation dictionaryRepresentation) {
tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossierTemplate().put(dossierTemplateId, dictionaryRepresentation);
}
@SneakyThrows
private void addDictionaryRepresentationForDossier(String dossierId, DictionaryRepresentation dictionaryRepresentation) {
tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier().put(dossierId, dictionaryRepresentation);
}
@SneakyThrows
private DictionaryRepresentation getDossierTemplateDictionary(String dossierTemplateId) {
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossierTemplate()
.get(dossierTemplateId);
}
@SneakyThrows
private DictionaryRepresentation getDossierDictionary(String dossierId) {
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier()
.get(dossierId);
return (cachedVersion.getDossierTemplateVersion() >= latestTemplateVersion) && (cachedVersion.getDossierVersion() >= latestDossierVersion);
}
@SneakyThrows
private boolean dossierDictionaryExists(String dossierId) {
return tenantDictionaryCache.get(TenantContext.getTenantId()).getDictionariesByDossier().containsKey(dossierId);
}
private Long getVersion(DictionaryRepresentation dictionaryRepresentation) {
if (dictionaryRepresentation == null) {
return null;
} else {
return dictionaryRepresentation.getDictionaryVersion();
}
}
private List<CommonsDictionaryModel> convertDictionaryModel(List<DictionaryModel> dictionaries) {
return dictionaries.stream()
.map(d -> CommonsDictionaryModel.builder()
.type(d.getType())
.rank(d.getRank())
.color(d.getColor())
.caseInsensitive(d.isCaseInsensitive())
.hint(d.isHint())
.isDossierDictionary(d.isDossierDictionary())
.entries(d.getEntries())
.falsePositives(d.getFalsePositives())
.falseRecommendations(d.getFalseRecommendations())
.build())
.collect(Collectors.toList());
}
private List<DictionaryModel> convertCommonsDictionaryModel(List<CommonsDictionaryModel> commonsDictionaries) {
return commonsDictionaries.stream()
.map(cd -> new DictionaryModel(cd.getType(),
cd.getRank(),
cd.getColor(),
cd.isCaseInsensitive(),
cd.isHint(),
cd.getEntries(),
cd.getFalsePositives(),
cd.getFalseRecommendations(),
cd.isDossierDictionary()))
.collect(Collectors.toList());
}
public List<Type> getAllTypes(String dossierTemplateId, String dossierId) {
List<Type> allTypes = dictionaryClient.getAllTypesForDossierTemplate(dossierTemplateId, null, false);
allTypes.addAll(dictionaryClient.getAllTypesForDossier(dossierId, null, false));
return allTypes;
DictionaryRepresentation dossierDictRep = dictionaryCacheService.getDossierDictionary(dossierId);
return dossierDictRep != null;
}
}

View File

@ -265,7 +265,7 @@ public class RedactionStorageService {
// And the cache eviction logic when a file changes after e.g. ocr is not implemented yet.
// See https://knecon.atlassian.net/jira/software/c/projects/RED/boards/37?selectedIssue=RED-8106.
@Timed("redactmanager_getDocumentGraph")
@Cacheable(value = "documentDataCache")
//@Cacheable(value = "documentDataCache")
public DocumentData getDocumentData(String dossierId, String fileId) {
try {

View File

@ -67,6 +67,7 @@ import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.service.DictionaryCacheService;
import com.iqser.red.service.redaction.v1.server.service.DocumentSearchService;
import com.iqser.red.service.redaction.v1.server.service.UnprocessedChangesService;
import com.iqser.red.service.redaction.v1.server.service.websocket.RedisSyncedWebSocketService;
@ -207,6 +208,10 @@ public abstract class AbstractRedactionIntegrationTest {
@Autowired
protected TenantMongoLiquibaseExecutor tenantMongoLiquibaseExecutor;
@Autowired
DictionaryCacheService dictionaryCacheService;
protected final Map<String, List<String>> dictionary = new HashMap<>();
protected final Map<String, List<String>> dossierDictionary = new HashMap<>();
protected final Map<String, List<String>> falsePositive = new HashMap<>();
@ -271,6 +276,7 @@ public abstract class AbstractRedactionIntegrationTest {
}
entityLogDocumentRepository.deleteAll();
entityLogEntryDocumentRepository.deleteAll();
dictionaryCacheService.clearAllCaches();
}

View File

@ -62,6 +62,7 @@ import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryFactory;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryVersion;
@ -148,6 +149,8 @@ import lombok.extern.slf4j.Slf4j;
private MongoConnectionProvider mongoConnectionProvider;
@MockBean
private TenantProvider tenantProvider;
@Autowired
private DictionaryFactory dictionaryFactory;
@Test
@ -250,17 +253,12 @@ import lombok.extern.slf4j.Slf4j;
testDossierTemplate = new TestDossierTemplate(dossierTemplateToUse);
when(dictionaryService.updateDictionary(any(), any())).thenReturn(new DictionaryVersion(0, 0));
when(dictionaryService.getDeepCopyDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary);
when(dictionaryService.getDictionary(any(), any())).thenReturn(testDossierTemplate.testDictionary);
when(dictionaryService.getDictionaryIncrements(any(), any(), any())).thenReturn(new DictionaryIncrement(Collections.emptySet(), new DictionaryVersion(0, 0)));
when(dictionaryService.isHint(any(String.class), any())).thenAnswer(invocation -> {
String type = invocation.getArgument(0);
return testDossierTemplate.testDictionary.isHint(type);
});
when(dictionaryService.getColor(any(String.class), any())).thenAnswer(invocation -> {
String type = invocation.getArgument(0);
return testDossierTemplate.testDictionary.getType(type).getColor();
});
when(dictionaryService.getNotRedactedColor(any())).thenReturn(new float[]{0.2f, 0.2f, 0.2f});
when(rulesClient.getVersion(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(System.currentTimeMillis());
when(rulesClient.getRules(testDossierTemplate.id, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(testDossierTemplate.rules));
@ -422,7 +420,7 @@ import lombok.extern.slf4j.Slf4j;
componentRules = new String(Files.readAllBytes(componentRuleFile.toPath()));
}
testDictionary = new Dictionary(dictionaries, new DictionaryVersion(0, 0));
testDictionary = dictionaryFactory.create(dictionaries, new DictionaryVersion(0, 0));
}

View File

@ -225,14 +225,14 @@ public class DictionaryServiceTest {
when(dictionaryClient.getDictionaryForType("dossierType", 0L)).thenReturn(dossierType);
dictionaryService.updateDictionary("dtId", "dossierId");
var dict = dictionaryService.getDeepCopyDictionary("dtId", "dossierId");
assertThat(dict.getDictionaryModels().size()).isEqualTo(2);
var dict = dictionaryService.getDictionary("dtId", "dossierId");
assertThat(dict.getDictionaryModels().size()).isEqualTo(1);
var dictModel = dict.getDictionaryModels()
.get(0);
assertThat(dictModel.getType()).isEqualTo(type);
assertThat(dictModel.getEntries().size()).isEqualTo(3);
dictModel.getEntries()
.forEach(entry -> assertThat(entry.getTypeId()).isEqualTo(dtType.getTypeId()));
.forEach(entry -> assertThat(entry.getTypeId()).isEqualTo("dossierType"));
}
}

View File

@ -38,6 +38,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemp
import com.iqser.red.service.redaction.v1.server.annotate.AnnotateRequest;
import com.iqser.red.service.redaction.v1.server.annotate.AnnotateResponse;
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
import com.iqser.red.service.redaction.v1.server.service.DictionaryCacheService;
import com.iqser.red.service.redaction.v1.server.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -54,6 +55,9 @@ public class RedactionAcceptanceTest extends AbstractRedactionIntegrationTest {
@Autowired
DictionaryService dictionaryService;
@Autowired
DictionaryCacheService dictionaryCacheService;
@BeforeEach
public void stubClients() {
@ -105,7 +109,7 @@ public class RedactionAcceptanceTest extends AbstractRedactionIntegrationTest {
String EFSA_SANITISATION_RULES = loadFromClassPath("drools/efsa_sanitisation.drl");
when(rulesClient.getRules(TEST_DOSSIER_TEMPLATE_ID, RuleFileType.ENTITY)).thenReturn(JSONPrimitive.of(EFSA_SANITISATION_RULES));
dossierDictionary.put(PUBLISHED_INFORMATION_INDICATOR, new ArrayList<>());
dictionaryService.clearTenantDictionaryCache();
dictionaryCacheService.clearAllCaches();
AnalyzeRequest request = uploadFileToStorage("files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf");
System.out.println("Start Full integration test");
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);

View File

@ -0,0 +1,253 @@
package com.iqser.red.service.redaction.v1.server.document.graph;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie;
import com.iqser.red.service.redaction.v1.server.model.dictionary.*;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
public class DictionarySearchImplementationsTest {
private static final int LARGE_TEXT_REPETITIONS = 50_000;
private static final int MAX_PII_ENTRY_COUNT = 500_000;
private static final String LARGE_TEXT_SAMPLE = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
+ "Entity_1 match text. Recommendation_1 also here. Random text continues. ";
// Dictionary identifiers
protected static final String VERTEBRATE_INDICATOR = "vertebrate";
protected static final String DICTIONARY_ADDRESS = "CBI_address";
protected static final String DICTIONARY_AUTHOR = "CBI_author";
protected static final String DICTIONARY_SPONSOR = "CBI_sponsor";
protected static final String DICTIONARY_PII = "PII";
protected static final String NO_REDACTION_INDICATOR = "no_redaction_indicator";
protected static final String REDACTION_INDICATOR = "redaction_indicator";
protected static final String HINT_ONLY_INDICATOR = "hint_only";
protected static final String MUST_REDACT_INDICATOR = "must_redact";
protected static final String PUBLISHED_INFORMATION_INDICATOR = "published_information";
protected static final String TEST_METHOD_INDICATOR = "test_method";
protected static final String PURITY_INDICATOR = "purity";
@Test
public void performanceTestWithRealDictionaries() {
// Load dictionaries from files
Map<String, List<String>> loadedDictionaries = loadDictionaries();
Map<DictionaryIdentifier, List<String>> dictionaryValues = new HashMap<>();
Random random = new Random();
// Randomly assign EntityType.ENTITY or EntityType.RECOMMENDATION to dictionaries
for (Map.Entry<String, List<String>> entry : loadedDictionaries.entrySet()) {
String dictionaryName = entry.getKey();
List<String> dictionaryTerms = entry.getValue();
EntityType entityType = random.nextBoolean() ? EntityType.ENTITY : EntityType.RECOMMENDATION;
boolean caseSensitive = random.nextBoolean();
DictionaryIdentifier identifier = new DictionaryIdentifier(dictionaryName, entityType, true, caseSensitive);
dictionaryValues.put(identifier, dictionaryTerms);
}
// **Added dummy dictionaries as per request**
// Case-sensitive dictionary containing "Entity_1"
DictionaryIdentifier entity1Identifier = new DictionaryIdentifier("dummy_case_sensitive", EntityType.ENTITY, true, true // Case-sensitive
);
dictionaryValues.put(entity1Identifier, List.of("Entity_1"));
// Case-insensitive dictionary containing "recommendation_1"
DictionaryIdentifier recommendation1Identifier = new DictionaryIdentifier("dummy_case_insensitive", EntityType.RECOMMENDATION, true, false // Case-insensitive
);
dictionaryValues.put(recommendation1Identifier, List.of("recommendation_1"));
// Measure construction time for TrieDictionarySearch
long trieDictionaryConstructionStart = System.currentTimeMillis();
DoubleTrieDictionarySearch doubleTrieDictionarySearchImpl = new DoubleTrieDictionarySearch(dictionaryValues);
long trieDictionaryConstructionDuration = System.currentTimeMillis() - trieDictionaryConstructionStart;
// Measure construction time for AnotherTrieDictionarySearch
long anotherTrieConstructionStart = System.currentTimeMillis();
Map<String, List<DictionaryIdentifierWithKeyword>> keyWordToIdentifiersMap = computeStringIdentifiersMap(dictionaryValues);
AhoCorasickMapDictionarySearch ahoCorasickMapDictionarySearchImpl = new AhoCorasickMapDictionarySearch(keyWordToIdentifiersMap);
long anotherTrieConstructionDuration = System.currentTimeMillis() - anotherTrieConstructionStart;
// Measure construction time for SearchImplementations
long searchTrieConstructionStart = System.currentTimeMillis();
List<SearchImplementation> searchImplementations = dictionaryValues.entrySet()
.stream()
.map(entry -> new SearchImplementation(entry.getValue(), !entry.getKey().caseSensitive()))
.toList();
long searchTrieConstructionDuration = System.currentTimeMillis() - searchTrieConstructionStart;
// Measure construction time for DoubleArrayTrieDictionarySearch
long doubleArrayTrieConstructionStart = System.currentTimeMillis();
DoubleArrayTrieDictionarySearch doubleArrayTrieSearchImpl = new DoubleArrayTrieDictionarySearch(keyWordToIdentifiersMap);
long doubleArrayTrieConstructionDuration = System.currentTimeMillis() - doubleArrayTrieConstructionStart;
String largeText = LARGE_TEXT_SAMPLE.repeat(LARGE_TEXT_REPETITIONS);
// Measure search time for TrieDictionarySearch
long trieDictionarySearchStart = System.currentTimeMillis();
List<DoubleTrieDictionarySearch.MatchTextRange> trieDictionaryMatches = doubleTrieDictionarySearchImpl.getBoundariesAsList(largeText);
long trieDictionarySearchDuration = System.currentTimeMillis() - trieDictionarySearchStart;
// Measure search time for AnotherTrieDictionarySearch
long anotherTrieSearchStart = System.currentTimeMillis();
List<DictionarySearch.MatchTextRange> anotherTrieMatches = ahoCorasickMapDictionarySearchImpl.getBoundaries(largeText)
.toList();
long anotherTrieSearchDuration = System.currentTimeMillis() - anotherTrieSearchStart;
// Measure search time for SearchImplementations
long searchImplStart = System.currentTimeMillis();
List<TextRange> searchMatches = new ArrayList<>();
for (SearchImplementation searchImpl : searchImplementations) {
searchMatches.addAll(searchImpl.getBoundaries(largeText));
}
long searchImplDuration = System.currentTimeMillis() - searchImplStart;
// Measure search time for DoubleArrayTrieDictionarySearch
long doubleArrayTrieSearchStart = System.currentTimeMillis();
List<DictionarySearch.MatchTextRange> doubleArrayTrieMatches = doubleArrayTrieSearchImpl.getBoundariesAsList(largeText);
long doubleArrayTrieSearchDuration = System.currentTimeMillis() - doubleArrayTrieSearchStart;
// Output the performance results
System.out.println("\nTotal number of keywords is: " + keyWordToIdentifiersMap.size());
System.out.printf("DoubleTrieDictionarySearch construction took %d ms%n", trieDictionaryConstructionDuration);
System.out.printf("DoubleTrieDictionarySearch search took %d ms and found %d matches%n", trieDictionarySearchDuration, trieDictionaryMatches.size());
System.out.println();
System.out.printf("AhoCorasickMapDictionarySearch construction took %d ms%n", anotherTrieConstructionDuration);
System.out.printf("AhoCorasickMapDictionarySearch search took %d ms and found %d matches%n", anotherTrieSearchDuration, anotherTrieMatches.size());
System.out.println();
System.out.printf("Multiple Tries construction took %d ms%n", searchTrieConstructionDuration);
System.out.printf("Combined SearchImplementations search took %d ms and found %d matches%n", searchImplDuration, searchMatches.size());
System.out.println();
System.out.printf("DoubleArrayTrieDictionarySearch construction took %d ms%n", doubleArrayTrieConstructionDuration);
System.out.printf("DoubleArrayTrieDictionarySearch search took %d ms and found %d matches%n", doubleArrayTrieSearchDuration, doubleArrayTrieMatches.size());
System.out.println();
// Assert that all implementations found matches
assert !trieDictionaryMatches.isEmpty()
&& !anotherTrieMatches.isEmpty()
&& !searchMatches.isEmpty()
&& !doubleArrayTrieMatches.isEmpty() : "All implementations should find entities.";
// Ensure all implementations found the same number of matches
int expectedMatches = trieDictionaryMatches.size();
assertEquals(expectedMatches, anotherTrieMatches.size(), "Mismatch between DoubleTrieDictionarySearch and AhoCorasickMapDictionarySearch");
assertEquals(expectedMatches, searchMatches.size(), "Mismatch between DoubleTrieDictionarySearch and Combined SearchImplementations");
assertEquals(expectedMatches, doubleArrayTrieMatches.size(), "Mismatch between DoubleTrieDictionarySearch and DoubleArrayTrieDictionarySearch");
}
private Map<String, List<String>> loadDictionaries() {
Map<String, List<String>> dictionaries = new HashMap<>();
dictionaries.put(DICTIONARY_AUTHOR, loadDictionaryFromFile("dictionaries/CBI_author.txt"));
dictionaries.put(DICTIONARY_SPONSOR, loadDictionaryFromFile("dictionaries/CBI_sponsor.txt"));
dictionaries.put(VERTEBRATE_INDICATOR, loadDictionaryFromFile("dictionaries/vertebrate.txt"));
dictionaries.put(DICTIONARY_ADDRESS, loadDictionaryFromFile("dictionaries/CBI_address.txt"));
dictionaries.put(NO_REDACTION_INDICATOR, loadDictionaryFromFile("dictionaries/no_redaction_indicator.txt"));
dictionaries.put(REDACTION_INDICATOR, loadDictionaryFromFile("dictionaries/redaction_indicator.txt"));
dictionaries.put(HINT_ONLY_INDICATOR, loadDictionaryFromFile("dictionaries/hint_only.txt"));
dictionaries.put(MUST_REDACT_INDICATOR, loadDictionaryFromFile("dictionaries/must_redact.txt"));
dictionaries.put(PUBLISHED_INFORMATION_INDICATOR, loadDictionaryFromFile("dictionaries/published_information.txt"));
dictionaries.put(TEST_METHOD_INDICATOR, loadDictionaryFromFile("dictionaries/test_method.txt"));
List<String> piis = loadDictionaryFromFile("dictionaries/PII_large.txt");
dictionaries.put(DICTIONARY_PII, MAX_PII_ENTRY_COUNT < piis.size() ? piis.subList(0, MAX_PII_ENTRY_COUNT) : piis);
dictionaries.put(PURITY_INDICATOR, loadDictionaryFromFile("dictionaries/purity.txt"));
return dictionaries;
}
private List<String> loadDictionaryFromFile(String filePath) {
List<String> terms = new ArrayList<>();
try (BufferedReader reader = new BufferedReader(new InputStreamReader(Objects.requireNonNull(Thread.currentThread()
.getContextClassLoader()
.getResourceAsStream(filePath))))) {
terms = reader.lines()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toList());
} catch (Exception e) {
System.err.println("Failed to load dictionary from " + filePath + ": " + e.getMessage());
}
return terms;
}
private String cleanDictionaryEntry(String entry) {
return entry.trim();
}
private static Map<String, List<DictionaryIdentifierWithKeyword>> computeStringIdentifiersMap(Map<DictionaryIdentifier, List<String>> dictionaryValues) {
Map<String, List<DictionaryIdentifierWithKeyword>> stringToIdentifiersMap = new HashMap<>();
for (Map.Entry<DictionaryIdentifier, List<String>> entry : dictionaryValues.entrySet()) {
DictionaryIdentifier identifier = entry.getKey();
List<String> values = entry.getValue();
for (String value : values) {
DictionaryIdentifierWithKeyword idWithKeyword = new DictionaryIdentifierWithKeyword(identifier, value);
stringToIdentifiersMap.computeIfAbsent(value.toLowerCase(Locale.ROOT), k -> new ArrayList<>()).add(idWithKeyword);
}
}
return stringToIdentifiersMap;
}
@Test
public void testMultiplePayloads() {
DoubleTrieDictionarySearch dictionarySearchImpl = new DoubleTrieDictionarySearch(Map.of(new DictionaryIdentifier("type1", EntityType.ENTITY, false, false),
List.of("apple", "banana"),
new DictionaryIdentifier("type2", EntityType.RECOMMENDATION, false, false),
List.of("apple", "orange"),
new DictionaryIdentifier("type3", EntityType.FALSE_POSITIVE, false, false),
List.of("apple", "kiwi")));
List<DoubleTrieDictionarySearch.MatchTextRange> dictionaryMatches = dictionarySearchImpl.getBoundariesAsList(
"an apple is delicious, a banana and a kiwi as well. orange is a color.");
assertEquals(dictionaryMatches.size(), 6);
}
@Test
public void testDoubleArrayTrie() {
Map<String, List<String>> map = new HashMap<>();
String[] keyArray = new String[]{"hers", "his", "she", "he"};
for (String key : keyArray) {
map.put(key, List.of(key, key, key));
}
AhoCorasickDoubleArrayTrie<List<String>> acdat = new AhoCorasickDoubleArrayTrie<>();
acdat.build(map);
final String text = "uhers";
List<AhoCorasickDoubleArrayTrie.Hit<List<String>>> wordList = acdat.parseText(text);
assertEquals(wordList.size(), 2);
}
}

View File

@ -28,9 +28,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSON
import com.iqser.red.service.redaction.v1.server.logger.Context;
import com.iqser.red.service.redaction.v1.server.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.model.dictionary.Dictionary;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.dictionary.DictionarySearch;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
@ -99,15 +97,11 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
Document document = buildGraph(filename);
dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
Dictionary dictionary = dictionaryService.getDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
long dictionarySearchStart = System.currentTimeMillis();
List<TextEntity> foundEntities = new LinkedList<>();
for (DictionaryModel model : dictionary.getDictionaryModels()) {
findEntitiesWithSearchImplementation(document, model.getEntriesSearch(), EntityType.ENTITY, foundEntities, model.getType());
findEntitiesWithSearchImplementation(document, model.getFalsePositiveSearch(), EntityType.FALSE_POSITIVE, foundEntities, model.getType());
findEntitiesWithSearchImplementation(document, model.getFalseRecommendationsSearch(), EntityType.FALSE_RECOMMENDATION, foundEntities, model.getType());
}
findEntitiesWithSearchImplementation(document, dictionary.getDictionarySearch(), foundEntities);
System.out.printf("Dictionary search took %d ms and found %d entities\n", System.currentTimeMillis() - dictionarySearchStart, foundEntities.size());
long graphInsertionStart = System.currentTimeMillis();
@ -174,7 +168,7 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
Document document = buildGraph(filename);
dictionaryService.updateDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
Dictionary dictionary = dictionaryService.getDeepCopyDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
Dictionary dictionary = dictionaryService.getDictionary(TEST_DOSSIER_TEMPLATE_ID, TEST_DOSSIER_ID);
int numberOfRuns = 1;
float totalSearchTime = 0;
@ -190,11 +184,7 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
totalGraphTime += graphTime;
var searchStart = System.currentTimeMillis();
for (var model : dictionary.getDictionaryModels()) {
findEntitiesWithSearchImplementation(document, model.getEntriesSearch(), EntityType.ENTITY, foundEntities, model.getType());
findEntitiesWithSearchImplementation(document, model.getFalsePositiveSearch(), EntityType.FALSE_POSITIVE, foundEntities, model.getType());
findEntitiesWithSearchImplementation(document, model.getFalseRecommendationsSearch(), EntityType.FALSE_RECOMMENDATION, foundEntities, model.getType());
}
findEntitiesWithSearchImplementation(document, dictionary.getDictionarySearch(), foundEntities);
var searchTime = System.currentTimeMillis() - searchStart;
totalSearchTime += searchTime;
@ -272,16 +262,12 @@ public class DocumentPerformanceIntegrationTest extends RulesIntegrationTest {
}
private void findEntitiesWithSearchImplementation(Document document,
SearchImplementation searchImplementation,
EntityType entityType,
List<TextEntity> foundEntities,
String type) {
private void findEntitiesWithSearchImplementation(Document document, DictionarySearch dictionarySearch, List<TextEntity> foundEntities) {
TextBlock textBlock = document.getTextBlock();
searchImplementation.getBoundaries(textBlock)
.filter(boundary -> boundaryIsSurroundedBySeparators(textBlock, boundary))
.map(bounds -> TextEntity.initialEntityNode(bounds, type, entityType, document))
dictionarySearch.getBoundaries(textBlock)
.filter(match -> boundaryIsSurroundedBySeparators(textBlock, match.textRange()))
.map(match -> TextEntity.initialEntityNode(match.textRange(), match.identifier().type(), match.identifier().entityType(), document))
.forEach(foundEntities::add);
}

View File

@ -59,7 +59,6 @@ public class PrecursorEntityTest extends BuildDocumentIntegrationTest {
public void stubMethods() {
MockitoAnnotations.openMocks(this);
when(dictionaryService.getColor(DICTIONARY_AUTHOR, TEST_DOSSIER_TEMPLATE_ID)).thenReturn(new float[]{0f, 0f, 0f});
when(dictionaryService.isHint(any(), any())).thenReturn(false);
}

View File

@ -218,7 +218,7 @@ public class LiveDataIntegrationTest {
dictionaryService.updateDictionary("dossierTemplateId", "dossierId");
var dict = dictionaryService.getDeepCopyDictionary("dossierTemplateId", "dossierId");
var dict = dictionaryService.getDictionary("dossierTemplateId", "dossierId");
assertThat(dict.getLocalAccessMap().size()).isEqualTo(12);
}