RED-106: Replace the local dictionary preload with the remote dictionary service.

This commit is contained in:
cheng 2020-07-17 18:08:51 +02:00
parent fe5c20e1a0
commit 8ed548f1a8
6 changed files with 44 additions and 38 deletions

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.client;
import org.springframework.cloud.openfeign.FeignClient;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import com.iqser.red.service.configuration.v1.api.resource.RulesResource;
/**
 * Feign client for the operations declared by {@link DictionaryResource}.
 * <p>
 * The interface adds no methods of its own; everything callable on it is inherited from
 * {@link DictionaryResource}. Requests are sent to port 8080 of the host named by
 * {@link RulesResource#SERVICE_NAME}.
 * <p>
 * NOTE(review): the service name is taken from {@code RulesResource}, not from
 * {@code DictionaryResource} - presumably both resources are served by the same service; confirm.
 */
@FeignClient(name = RulesResource.SERVICE_NAME, url = "http://" + RulesResource.SERVICE_NAME + ":8080")
public interface DictionaryClient extends DictionaryResource {
}

View File

@ -1,58 +1,57 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.PostConstruct;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import feign.FeignException;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@Slf4j
public class DictionaryService {
public static final String VERTEBRATES_CODE = "VERTEBRATE";
public static final String ADDRESS_CODE = "ADDRESS";
public static final String NAME_CODE = "NAME";
public static final String NO_REDACTION_INDICATOR = "NO_REDACTION_INDICATOR";
private final DictionaryClient dictionaryClient;
private long dictionaryVersion = -1;
@Getter
private Map<String, Set<String>> dictionary = new HashMap<>();
@Getter
private long generation;
/**
 * Post-construction hook that fills the dictionary by delegating to
 * {@link #loadFromResourceFiles()}.
 */
@PostConstruct
public void init() {
loadFromResourceFiles();
}
private Map<String, Set<String>> dictionaryEntry = new HashMap<>();
/**
 * Reloads the cached dictionary entries when the available version is newer than the one
 * currently held.
 * <p>
 * The new version number is recorded only after the entries have been retrieved. If retrieval
 * fails with an exception, the stored version stays unchanged and the next call tries again.
 */
public void updateDictionary() {
    // TODO: obtain the version from dictionaryClient.getVersion() instead of this placeholder.
    long version = 1;
    if (version > dictionaryVersion) {
        // Fetch first: retrieveDictionaryEntry() rethrows FeignException. Recording the version
        // before a failed fetch would mark the dictionary as up to date and suppress every retry.
        dictionaryEntry = retrieveDictionaryEntry();
        dictionaryVersion = version;
    }
}
public void loadFromResourceFiles() {
dictionary.computeIfAbsent(NAME_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/names.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
dictionary.computeIfAbsent(VERTEBRATES_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/vertebrates.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
dictionary.computeIfAbsent(ADDRESS_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/addresses.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
dictionary.computeIfAbsent(NO_REDACTION_INDICATOR, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/NoRedactionIndicator.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
/**
 * Fetches every dictionary type from the dictionary client and, for each type, its entries.
 *
 * @return a map from type name to the set of entries of that type; an empty map when the
 *         client reports no types
 * @throws FeignException when a call to the dictionary client fails; the exception is logged
 *         and rethrown unchanged
 */
private Map<String, Set<String>> retrieveDictionaryEntry() {
    try {
        TypeResponse typeResponse = dictionaryClient.getAllTypes();
        if (typeResponse == null || CollectionUtils.isEmpty(typeResponse.getTypes())) {
            return Collections.emptyMap();
        }
        // distinct(): Collectors.toMap throws IllegalStateException on a repeated key, so a type
        // reported twice must not reach the collector twice.
        List<String> types = typeResponse.getTypes().stream()
                .map(typeResult -> typeResult.getType())
                .distinct()
                .collect(Collectors.toList());
        return types.stream().collect(Collectors.toMap(
                type -> type,
                // emptyIfNull: a type whose entries are null maps to an empty set instead of
                // failing with a NullPointerException.
                type -> CollectionUtils.emptyIfNull(dictionaryClient.getDictionaryForType(type).getEntries())
                        .stream()
                        .collect(Collectors.toSet())));
    } catch (FeignException e) {
        log.warn("Got some unknown feignException", e);
        throw e;
    }
}
/**
 * Normalizes a raw dictionary entry.
 * <p>
 * The entry is first passed through {@link TextNormalizationUtilities#removeHyphenLineBreaks},
 * then every remaining newline character is replaced by a single space.
 *
 * @param entry the raw dictionary entry
 * @return the normalized entry
 */
private String cleanDictionaryEntry(String entry) {
    String withoutHyphenBreaks = TextNormalizationUtilities.removeHyphenLineBreaks(entry);
    return withoutHyphenBreaks.replaceAll("\\n", " ");
}
}
}

View File

@ -19,15 +19,16 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Service
@RequiredArgsConstructor
@Slf4j
public class EntityRedactionService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
public void processDocument(Document classifiedDoc) {
dictionaryService.updateDictionary();
@ -98,13 +99,12 @@ public class EntityRedactionService {
});
}
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
String normalizedInputString = searchableText.toString();
Set<Entity> found = new HashSet<>();
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionaryEntry().entrySet()) {
for (String value : entry.getValue()) {
int startIndex;
int stopIndex = 0;
@ -130,7 +130,6 @@ public class EntityRedactionService {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : entities) {
@ -142,6 +141,4 @@ public class EntityRedactionService {
}
entities.removeAll(wordsToRemove);
}
}