Pull request #6: RED-106: replace the local dictionary preload with remote dictionary service.
Merge in RED/redaction-service from feature/RED-106i to master * commit 'd607ac567d1e07cfc4c7dd4fa581d1881a874537': RED-106: integrationTest fixed. RED-106: integrationTest fixed. RED-106: integrationTest fixed. RED-106: rebase onto master RED-106: rebase onto master DEV: bugfix missing bean RED-106: enable dictionary version RED-106: replace the local dictionary preload with remote dictionary service. RED-106: replace the local dictionary preload with remote dictionary service.
This commit is contained in:
commit
302daff526
@ -39,7 +39,7 @@
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>configuration-service-api-v1</artifactId>
|
||||
<version>1.0.0</version>
|
||||
<version>1.0.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.drools</groupId>
|
||||
|
||||
@ -0,0 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
|
||||
import com.iqser.red.service.configuration.v1.api.resource.RulesResource;
|
||||
|
||||
/**
 * Feign client exposing the endpoints declared by {@link DictionaryResource}.
 * The target URL is assembled as {@code "http://" + RulesResource.SERVICE_NAME + ":8080"}.
 *
 * NOTE(review): both the client name and the host are taken from {@code RulesResource}, not from
 * {@code DictionaryResource} - presumably both resources are served by the same configuration
 * service; confirm.
 */
@FeignClient(name = RulesResource.SERVICE_NAME, url = "http://" + RulesResource.SERVICE_NAME + ":8080")
|
||||
public interface DictionaryClient extends DictionaryResource {
|
||||
}
|
||||
@ -1,58 +1,56 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@Slf4j
|
||||
public class DictionaryService {
|
||||
|
||||
public static final String VERTEBRATES_CODE = "VERTEBRATE";
|
||||
public static final String ADDRESS_CODE = "ADDRESS";
|
||||
public static final String NAME_CODE = "NAME";
|
||||
public static final String NO_REDACTION_INDICATOR = "NO_REDACTION_INDICATOR";
|
||||
private final DictionaryClient dictionaryClient;
|
||||
|
||||
private long dictionaryVersion = -1;
|
||||
|
||||
@Getter
|
||||
private Map<String, Set<String>> dictionary = new HashMap<>();
|
||||
|
||||
@Getter
|
||||
private long generation;
|
||||
|
||||
/**
 * Lifecycle hook annotated with {@code @PostConstruct}; it only delegates to
 * {@code loadFromResourceFiles()}.
 */
@PostConstruct
|
||||
public void init() {
|
||||
loadFromResourceFiles();
|
||||
}
|
||||
|
||||
private Map<String, float[]> entryColors = new HashMap<>();
|
||||
|
||||
public void updateDictionary() {
|
||||
//TODO
|
||||
|
||||
long version = dictionaryClient.getVersion();
|
||||
if (version > dictionaryVersion) {
|
||||
dictionaryVersion = version;
|
||||
updateDictionaryEntry();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void loadFromResourceFiles() {
|
||||
dictionary.computeIfAbsent(NAME_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/names.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
|
||||
dictionary.computeIfAbsent(VERTEBRATES_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/vertebrates.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
|
||||
dictionary.computeIfAbsent(ADDRESS_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/addresses.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
|
||||
dictionary.computeIfAbsent(NO_REDACTION_INDICATOR, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/NoRedactionIndicator.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
|
||||
private void updateDictionaryEntry() {
|
||||
try {
|
||||
TypeResponse typeResponse = dictionaryClient.getAllTypes();
|
||||
if (typeResponse != null && !CollectionUtils.isEmpty(typeResponse.getTypes())) {
|
||||
entryColors = typeResponse.getTypes().stream().collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
|
||||
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> dictionaryClient.getDictionaryForType(s).getEntries().stream().collect(Collectors.toSet())));
|
||||
}
|
||||
} catch (FeignException e) {
|
||||
log.warn("Got some unknown feignException", e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private String cleanDictionaryEntry(String entry) {
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(entry).replaceAll("\\n", " ");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -62,7 +62,7 @@ public class DroolsExecutionService {
|
||||
KieServices kieServices = KieServices.Factory.get();
|
||||
InputStream input = new ByteArrayInputStream(drlAsString.getBytes(StandardCharsets.UTF_8));
|
||||
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
|
||||
kieFileSystem.write(kieServices.getResources().newInputStreamResource(input));
|
||||
kieFileSystem.write("src/main/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
|
||||
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
|
||||
kieBuilder.buildAll();
|
||||
KieModule kieModule = kieBuilder.getKieModule();
|
||||
|
||||
@ -27,7 +27,6 @@ public class EntityRedactionService {
|
||||
private final DictionaryService dictionaryService;
|
||||
private final DroolsExecutionService droolsExecutionService;
|
||||
|
||||
|
||||
public void processDocument(Document classifiedDoc) {
|
||||
|
||||
dictionaryService.updateDictionary();
|
||||
@ -98,7 +97,6 @@ public class EntityRedactionService {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
|
||||
|
||||
String normalizedInputString = searchableText.toString();
|
||||
@ -130,7 +128,6 @@ public class EntityRedactionService {
|
||||
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’';
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
@ -142,6 +139,4 @@ public class EntityRedactionService {
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,10 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.visualization.service;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.ADDRESS_CODE;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.NAME_CODE;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.NO_REDACTION_INDICATOR;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.VERTEBRATES_CODE;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
@ -29,6 +24,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
@ -40,7 +36,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class AnnotationHighlightService {
|
||||
|
||||
private final DictionaryService dictionaryService;
|
||||
|
||||
public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction) throws IOException {
|
||||
|
||||
@ -177,36 +173,20 @@ public class AnnotationHighlightService {
|
||||
if (!entity.isRedaction()) {
|
||||
return false;
|
||||
}
|
||||
if (entity.getType().equals(ADDRESS_CODE)) {
|
||||
return true;
|
||||
if(entity.getType().equalsIgnoreCase("VERTEBRATE") || entity.getType().equalsIgnoreCase("NO_REDACTION_INDICATOR") ){
|
||||
// TODO in RED-161.
|
||||
return false;
|
||||
}
|
||||
if (entity.getType().equals(NAME_CODE)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
return dictionaryService.getDictionary().keySet().contains(entity.getType());
|
||||
}
|
||||
|
||||
|
||||
private float[] getColor(Entity entity) {
|
||||
if (!entity.isRedaction()) {
|
||||
return new float[]{0.627f, 0.627f, 0.627f};
|
||||
}
|
||||
if (entity.getType().equals(VERTEBRATES_CODE)) {
|
||||
return new float[]{0, 1, 0};
|
||||
}
|
||||
if (entity.getType().equals(ADDRESS_CODE)) {
|
||||
return new float[]{0, 1, 1};
|
||||
}
|
||||
if (entity.getType().equals(NAME_CODE)) {
|
||||
return new float[]{1, 1, 0};
|
||||
}
|
||||
if (entity.getType().equals(NO_REDACTION_INDICATOR)) {
|
||||
return new float[]{1, 0.502f, 0};
|
||||
}
|
||||
return null;
|
||||
return dictionaryService.getEntryColors().get(entity.getType());
|
||||
}
|
||||
|
||||
|
||||
private void visualizeTextBlock(TextBlock textBlock, PDPageContentStream contentStream) throws IOException {
|
||||
|
||||
contentStream.setStrokingColor(Color.LIGHT_GRAY);
|
||||
|
||||
@ -11,6 +11,11 @@ import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Before;
|
||||
@ -28,26 +33,44 @@ import org.springframework.boot.test.context.TestConfiguration;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.core.io.ResourceLoader;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
|
||||
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
|
||||
@Ignore
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(webEnvironment = DEFINED_PORT)
|
||||
public class RedactionIntegrationTest {
|
||||
|
||||
public static final String VERTEBRATES_CODE = "VERTEBRATE";
|
||||
public static final String ADDRESS_CODE = "ADDRESS";
|
||||
public static final String NAME_CODE = "NAME";
|
||||
public static final String NO_REDACTION_INDICATOR = "NO_REDACTION_INDICATOR";
|
||||
|
||||
@Autowired
|
||||
private RedactionController redactionController;
|
||||
|
||||
@MockBean
|
||||
private RulesClient rulesClient;
|
||||
|
||||
@MockBean
|
||||
private DictionaryClient dictionaryClient;
|
||||
|
||||
private Map<String, List<String>> dictionary = new HashMap<>();
|
||||
private Map<String, float[]> typeColorMap = new HashMap<>();
|
||||
|
||||
@TestConfiguration
|
||||
public static class RedactionIntegrationTestConfiguration {
|
||||
|
||||
@ -64,9 +87,7 @@ public class RedactionIntegrationTest {
|
||||
KieModule kieModule = kieBuilder.getKieModule();
|
||||
|
||||
return kieServices.newKieContainer(kieModule.getReleaseId());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Before
|
||||
@ -75,6 +96,40 @@ public class RedactionIntegrationTest {
|
||||
when(rulesClient.getVersion()).thenReturn(0L);
|
||||
when(rulesClient.getRules()).thenReturn(new RulesResponse(loadFromClassPath("drools/rules.drl")));
|
||||
|
||||
loadDictionaryForTest();
|
||||
loadTypeForTest();
|
||||
when(dictionaryClient.getVersion()).thenReturn(0L);
|
||||
when(dictionaryClient.getAllTypes()).thenReturn(TypeResponse.builder().types(getTypeResponse()).build());
|
||||
when(dictionaryClient.getDictionaryForType(VERTEBRATES_CODE)).thenReturn(getDictionaryResponse(VERTEBRATES_CODE));
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(getDictionaryResponse(ADDRESS_CODE));
|
||||
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(getDictionaryResponse(NAME_CODE));
|
||||
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR));
|
||||
}
|
||||
|
||||
private void loadDictionaryForTest() {
|
||||
dictionary.computeIfAbsent(NAME_CODE, v -> new ArrayList<>()).addAll(ResourceLoader.load("dictionaries/names.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(VERTEBRATES_CODE, v -> new ArrayList<>()).addAll(ResourceLoader.load("dictionaries/vertebrates.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(ADDRESS_CODE, v -> new ArrayList<>()).addAll(ResourceLoader.load("dictionaries/addresses.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(NO_REDACTION_INDICATOR, v -> new ArrayList<>()).addAll(ResourceLoader.load("dictionaries/NoRedactionIndicator.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
private String cleanDictionaryEntry(String entry) {
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(entry).replaceAll("\\n", " ");
|
||||
}
|
||||
|
||||
private void loadTypeForTest() {
|
||||
typeColorMap.put("VERTEBRATE", new float[]{0, 1, 0});
|
||||
typeColorMap.put("ADDRESS", new float[]{0, 1, 1});
|
||||
typeColorMap.put("NAME", new float[]{1, 1, 0});
|
||||
typeColorMap.put("NO_REDACTION_INDICATOR", new float[]{1, 0.502f, 0});
|
||||
}
|
||||
|
||||
private List<TypeResult> getTypeResponse() {
|
||||
return typeColorMap.entrySet().stream().map(typeColor -> TypeResult.builder().type(typeColor.getKey()).color(typeColor.getValue()).build()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private DictionaryResponse getDictionaryResponse(String type) {
|
||||
return DictionaryResponse.builder().color(typeColorMap.get(type)).entries(dictionary.get(type)).build();
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -171,7 +226,5 @@ public class RedactionIntegrationTest {
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("could not load classpath resource: " + path, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,64 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ResourceLoader {
|
||||
|
||||
public Map<String, String> loadDictionaryFiles() {
|
||||
|
||||
String name = "dictionaries/";
|
||||
|
||||
List<String> files;
|
||||
try {
|
||||
files = IOUtils.readLines(ResourceLoader.class.getClassLoader().getResourceAsStream(name), "UTF-8");
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("could not load classpath resource: " + name, e);
|
||||
}
|
||||
return files.stream().collect(Collectors.toMap(ResourceLoader::getFileName, s -> name + s));
|
||||
}
|
||||
|
||||
private String getFileName(String filePath) {
|
||||
return filePath.substring(0, filePath.indexOf(".txt"));
|
||||
}
|
||||
|
||||
public Set<String> load(String classpathPath) {
|
||||
|
||||
URL resource = ResourceLoader.class.getClassLoader().getResource(classpathPath);
|
||||
if (resource == null) {
|
||||
throw new IllegalArgumentException("could not load classpath resource: " + classpathPath);
|
||||
}
|
||||
try (BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), StandardCharsets.UTF_8))) {
|
||||
return br.lines().collect(Collectors.toSet());
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("could not load classpath resource: " + classpathPath, e);
|
||||
}
|
||||
}
|
||||
|
||||
public String loadToString(String classpathPath) {
|
||||
|
||||
URL resource = ResourceLoader.class.getClassLoader().getResource(classpathPath);
|
||||
if (resource == null) {
|
||||
throw new IllegalArgumentException("could not load classpath resource: " + classpathPath);
|
||||
}
|
||||
try (BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), StandardCharsets.UTF_8))) {
|
||||
return br.lines().collect(Collectors.joining("\n"));
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("could not load classpath resource: " + classpathPath, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextNormalizationUtilities {
|
||||
|
||||
/**
|
||||
* Revert hyphenation due to line breaks.
|
||||
* @param text Text to be processed.
|
||||
* @return Text without line-break hyphenation.
|
||||
*/
|
||||
public static String removeHyphenLineBreaks(String text) {
|
||||
return text.replaceAll("\\s(\\S+)[\\-\\u00AD]\\R|\n\r(.+ )", "\n$1$2");
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user