Pull request #6: RED-106: replace the local dictionary preload with the remote dictionary service.

Merge in RED/redaction-service from feature/RED-106i to master

* commit 'd607ac567d1e07cfc4c7dd4fa581d1881a874537':
  RED-106: integrationTest fixed.
  RED-106: integrationTest fixed.
  RED-106: integrationTest fixed.
  RED-106: rebase onto master
  RED-106: rebase onto master
  DEV: bugfix missing bean
  RED-106: enable dictionary version
  RED-106: replace the local dictionary preload with the remote dictionary service.
  RED-106: replace the local dictionary preload with the remote dictionary service.
This commit is contained in:
Dominique Eiflaender 2020-07-21 13:19:27 +02:00
commit 302daff526
13 changed files with 186 additions and 69 deletions

View File

@ -39,7 +39,7 @@
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>configuration-service-api-v1</artifactId>
<version>1.0.0</version>
<version>1.0.1</version>
</dependency>
<dependency>
<groupId>org.drools</groupId>

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.client;
import org.springframework.cloud.openfeign.FeignClient;
import com.iqser.red.service.configuration.v1.api.resource.DictionaryResource;
import com.iqser.red.service.configuration.v1.api.resource.RulesResource;
/**
 * Feign client for the dictionary endpoints declared by {@link DictionaryResource}.
 * <p>
 * The interface declares no methods of its own; every operation is inherited from
 * {@link DictionaryResource}. The target URL is built as
 * {@code "http://" + RulesResource.SERVICE_NAME + ":8080"}.
 * <p>
 * NOTE(review): both the client name and the host come from {@link RulesResource#SERVICE_NAME};
 * presumably the dictionary endpoints are served by the same service as the rules endpoints -
 * confirm.
 */
@FeignClient(name = RulesResource.SERVICE_NAME, url = "http://" + RulesResource.SERVICE_NAME + ":8080")
public interface DictionaryClient extends DictionaryResource {
}

View File

@ -1,58 +1,56 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.PostConstruct;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import feign.FeignException;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@Slf4j
public class DictionaryService {
public static final String VERTEBRATES_CODE = "VERTEBRATE";
public static final String ADDRESS_CODE = "ADDRESS";
public static final String NAME_CODE = "NAME";
public static final String NO_REDACTION_INDICATOR = "NO_REDACTION_INDICATOR";
private final DictionaryClient dictionaryClient;
private long dictionaryVersion = -1;
@Getter
private Map<String, Set<String>> dictionary = new HashMap<>();
@Getter
private long generation;
@PostConstruct
public void init() {
loadFromResourceFiles();
}
private Map<String, float[]> entryColors = new HashMap<>();
/**
 * Refreshes the cached dictionary when the remote service reports a newer version.
 * <p>
 * The locally remembered version is advanced only after the refresh call has returned. If the
 * refresh throws, the remembered version stays untouched and the next call tries again instead
 * of treating the stale data as current.
 */
public void updateDictionary() {
    long version = dictionaryClient.getVersion();
    if (version > dictionaryVersion) {
        // Refresh first: an exception here must not mark the new version as loaded.
        updateDictionaryEntry();
        dictionaryVersion = version;
    }
}
public void loadFromResourceFiles() {
dictionary.computeIfAbsent(NAME_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/names.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
dictionary.computeIfAbsent(VERTEBRATES_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/vertebrates.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
dictionary.computeIfAbsent(ADDRESS_CODE, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/addresses.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
dictionary.computeIfAbsent(NO_REDACTION_INDICATOR, v -> new HashSet<>()).addAll(ResourceLoader.load("dictionaries/NoRedactionIndicator.txt").stream().map(this::cleanDictionaryEntry).collect(Collectors.toList()));
/**
 * Reloads all types, their colors and their dictionary entries from the dictionary client.
 * <p>
 * Both maps are built in local variables and published only after every remote call has
 * succeeded, so a failure while fetching one type cannot leave {@code entryColors} and
 * {@code dictionary} describing different sets of types. When the type response is
 * {@code null} or contains no types, the current state is kept as it is.
 *
 * @throws FeignException if a remote call fails; the exception is logged and rethrown
 */
private void updateDictionaryEntry() {
    try {
        TypeResponse typeResponse = dictionaryClient.getAllTypes();
        if (typeResponse == null || CollectionUtils.isEmpty(typeResponse.getTypes())) {
            return;
        }
        Map<String, float[]> refreshedColors = typeResponse.getTypes()
                .stream()
                .collect(Collectors.toMap(TypeResult::getType, TypeResult::getColor));
        // One remote call per type; any failure aborts before the fields are touched.
        Map<String, Set<String>> refreshedDictionary = refreshedColors.keySet()
                .stream()
                .collect(Collectors.toMap(type -> type,
                        type -> dictionaryClient.getDictionaryForType(type).getEntries().stream().collect(Collectors.toSet())));
        entryColors = refreshedColors;
        dictionary = refreshedDictionary;
    } catch (FeignException e) {
        log.warn("Got some unknown feignException", e);
        throw e;
    }
}
private String cleanDictionaryEntry(String entry) {
return TextNormalizationUtilities.removeHyphenLineBreaks(entry).replaceAll("\\n", " ");
}
}
}

View File

@ -62,7 +62,7 @@ public class DroolsExecutionService {
KieServices kieServices = KieServices.Factory.get();
InputStream input = new ByteArrayInputStream(drlAsString.getBytes(StandardCharsets.UTF_8));
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
kieFileSystem.write(kieServices.getResources().newInputStreamResource(input));
kieFileSystem.write("src/main/resources/drools/rules.drl", kieServices.getResources().newInputStreamResource(input));
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
kieBuilder.buildAll();
KieModule kieModule = kieBuilder.getKieModule();

View File

@ -27,7 +27,6 @@ public class EntityRedactionService {
private final DictionaryService dictionaryService;
private final DroolsExecutionService droolsExecutionService;
public void processDocument(Document classifiedDoc) {
dictionaryService.updateDictionary();
@ -98,7 +97,6 @@ public class EntityRedactionService {
});
}
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
String normalizedInputString = searchableText.toString();
@ -130,7 +128,6 @@ public class EntityRedactionService {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : entities) {
@ -142,6 +139,4 @@ public class EntityRedactionService {
}
entities.removeAll(wordsToRemove);
}
}

View File

@ -1,10 +1,5 @@
package com.iqser.red.service.redaction.v1.server.visualization.service;
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.ADDRESS_CODE;
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.NAME_CODE;
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.NO_REDACTION_INDICATOR;
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.VERTEBRATES_CODE;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
@ -29,6 +24,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@ -40,7 +36,7 @@ import lombok.extern.slf4j.Slf4j;
@Service
@RequiredArgsConstructor
public class AnnotationHighlightService {
private final DictionaryService dictionaryService;
public void highlight(PDDocument document, Document classifiedDoc, boolean flatRedaction) throws IOException {
@ -177,36 +173,20 @@ public class AnnotationHighlightService {
if (!entity.isRedaction()) {
return false;
}
if (entity.getType().equals(ADDRESS_CODE)) {
return true;
if(entity.getType().equalsIgnoreCase("VERTEBRATE") || entity.getType().equalsIgnoreCase("NO_REDACTION_INDICATOR") ){
// TODO in RED-161.
return false;
}
if (entity.getType().equals(NAME_CODE)) {
return true;
}
return false;
return dictionaryService.getDictionary().keySet().contains(entity.getType());
}
/**
 * Returns the color components used to highlight the given entity.
 * <p>
 * Entities that are not redactions always get the same gray (0.627 per component).
 * <p>
 * NOTE(review): the rest of this span lists two variants back to back - a chain of hard-coded
 * per-type colors ending in {@code return null}, followed by a lookup in
 * {@code dictionaryService.getEntryColors()}. Presumably only the lookup belongs to the
 * current version of the method - confirm against the checked-out file.
 *
 * @param entity entity whose highlight color is requested
 * @return the color components, or {@code null} when no color is known for the entity type
 */
private float[] getColor(Entity entity) {
if (!entity.isRedaction()) {
return new float[]{0.627f, 0.627f, 0.627f};
}
// Hard-coded color per dictionary type.
if (entity.getType().equals(VERTEBRATES_CODE)) {
return new float[]{0, 1, 0};
}
if (entity.getType().equals(ADDRESS_CODE)) {
return new float[]{0, 1, 1};
}
if (entity.getType().equals(NAME_CODE)) {
return new float[]{1, 1, 0};
}
if (entity.getType().equals(NO_REDACTION_INDICATOR)) {
return new float[]{1, 0.502f, 0};
}
return null;
// Color lookup by entity type in the map exposed by the dictionary service.
return dictionaryService.getEntryColors().get(entity.getType());
}
private void visualizeTextBlock(TextBlock textBlock, PDPageContentStream contentStream) throws IOException {
contentStream.setStrokingColor(Color.LIGHT_GRAY);

View File

@ -11,6 +11,11 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
@ -28,26 +33,44 @@ import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import org.springframework.test.context.junit4.SpringRunner;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
@Ignore
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = DEFINED_PORT)
public class RedactionIntegrationTest {
public static final String VERTEBRATES_CODE = "VERTEBRATE";
public static final String ADDRESS_CODE = "ADDRESS";
public static final String NAME_CODE = "NAME";
public static final String NO_REDACTION_INDICATOR = "NO_REDACTION_INDICATOR";
@Autowired
private RedactionController redactionController;
@MockBean
private RulesClient rulesClient;
@MockBean
private DictionaryClient dictionaryClient;
private Map<String, List<String>> dictionary = new HashMap<>();
private Map<String, float[]> typeColorMap = new HashMap<>();
@TestConfiguration
public static class RedactionIntegrationTestConfiguration {
@ -64,9 +87,7 @@ public class RedactionIntegrationTest {
KieModule kieModule = kieBuilder.getKieModule();
return kieServices.newKieContainer(kieModule.getReleaseId());
}
}
@Before
@ -75,6 +96,40 @@ public class RedactionIntegrationTest {
when(rulesClient.getVersion()).thenReturn(0L);
when(rulesClient.getRules()).thenReturn(new RulesResponse(loadFromClassPath("drools/rules.drl")));
loadDictionaryForTest();
loadTypeForTest();
when(dictionaryClient.getVersion()).thenReturn(0L);
when(dictionaryClient.getAllTypes()).thenReturn(TypeResponse.builder().types(getTypeResponse()).build());
when(dictionaryClient.getDictionaryForType(VERTEBRATES_CODE)).thenReturn(getDictionaryResponse(VERTEBRATES_CODE));
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(getDictionaryResponse(ADDRESS_CODE));
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(getDictionaryResponse(NAME_CODE));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR));
}
/**
 * Fills the in-memory test dictionary from the bundled dictionary text files.
 * Every line of a file is cleaned with {@link #cleanDictionaryEntry(String)}, de-duplicated
 * and appended to the list kept for the corresponding type.
 */
private void loadDictionaryForTest() {
    // Pairs of {dictionary type, classpath location}, processed in this order.
    String[][] sources = {
            {NAME_CODE, "dictionaries/names.txt"},
            {VERTEBRATES_CODE, "dictionaries/vertebrates.txt"},
            {ADDRESS_CODE, "dictionaries/addresses.txt"},
            {NO_REDACTION_INDICATOR, "dictionaries/NoRedactionIndicator.txt"}
    };
    for (String[] source : sources) {
        List<String> entriesOfType = dictionary.computeIfAbsent(source[0], key -> new ArrayList<>());
        entriesOfType.addAll(ResourceLoader.load(source[1])
                .stream()
                .map(this::cleanDictionaryEntry)
                .collect(Collectors.toSet()));
    }
}
/**
 * Normalizes one raw dictionary line: hyphenation caused by line breaks is reverted first,
 * then every remaining newline character is replaced by a single space.
 *
 * @param entry raw dictionary line
 * @return the normalized entry
 */
private String cleanDictionaryEntry(String entry) {
    String withoutHyphenBreaks = TextNormalizationUtilities.removeHyphenLineBreaks(entry);
    return withoutHyphenBreaks.replaceAll("\\n", " ");
}
/**
 * Registers the color used for each of the four dictionary types in the test color map.
 */
private void loadTypeForTest() {
    float[] vertebrateColor = {0, 1, 0};
    float[] addressColor = {0, 1, 1};
    float[] nameColor = {1, 1, 0};
    float[] noRedactionIndicatorColor = {1, 0.502f, 0};

    typeColorMap.put("VERTEBRATE", vertebrateColor);
    typeColorMap.put("ADDRESS", addressColor);
    typeColorMap.put("NAME", nameColor);
    typeColorMap.put("NO_REDACTION_INDICATOR", noRedactionIndicatorColor);
}
/**
 * Converts the test color map into the list of type results handed to the mocked client.
 *
 * @return one {@code TypeResult} per entry of the color map, carrying its type and color
 */
private List<TypeResult> getTypeResponse() {
    List<TypeResult> typeResults = new ArrayList<>();
    for (Map.Entry<String, float[]> typeColor : typeColorMap.entrySet()) {
        TypeResult typeResult = TypeResult.builder()
                .type(typeColor.getKey())
                .color(typeColor.getValue())
                .build();
        typeResults.add(typeResult);
    }
    return typeResults;
}
/**
 * Builds the dictionary response returned by the mocked client for one type.
 *
 * @param type dictionary type to look up in the test color map and test dictionary
 * @return response carrying the color and the entries stored for that type
 */
private DictionaryResponse getDictionaryResponse(String type) {
    float[] color = typeColorMap.get(type);
    List<String> entries = dictionary.get(type);
    return DictionaryResponse.builder()
            .color(color)
            .entries(entries)
            .build();
}
@Test
@ -171,7 +226,5 @@ public class RedactionIntegrationTest {
} catch (IOException e) {
throw new IllegalArgumentException("could not load classpath resource: " + path, e);
}
}
}

View File

@ -0,0 +1,64 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import lombok.experimental.UtilityClass;
/**
 * Helpers for reading UTF-8 text resources from the classpath.
 * <p>
 * All failures to locate or read a resource are reported as {@link IllegalArgumentException}
 * with the message {@code "could not load classpath resource: <path>"}.
 */
@UtilityClass
public class ResourceLoader {

    /**
     * Reads the classpath resource {@code dictionaries/} line by line and treats every line as
     * the name of a file inside that directory.
     * <p>
     * NOTE(review): reading a directory resource as text presumably only yields a listing when
     * the classpath entry is a plain directory on disk (not a JAR) - confirm for the deployment.
     *
     * @return map from the listed name, cut before its first {@code ".txt"}, to
     *         {@code "dictionaries/" + listed name}
     * @throws IllegalArgumentException if the resource is missing or cannot be read
     */
    public Map<String, String> loadDictionaryFiles() {

        String name = "dictionaries/";
        List<String> files;
        // try-with-resources: the listing stream is closed even when reading fails.
        try (BufferedReader br = openReader(name)) {
            files = br.lines().collect(Collectors.toList());
        } catch (IOException e) {
            throw new IllegalArgumentException("could not load classpath resource: " + name, e);
        }
        return files.stream().collect(Collectors.toMap(ResourceLoader::getFileName, s -> name + s));
    }


    /**
     * Cuts the given name before its first occurrence of {@code ".txt"}.
     *
     * @param filePath listed file name
     * @return the part before {@code ".txt"}, or the unchanged input when it contains no
     *         {@code ".txt"}
     */
    private String getFileName(String filePath) {

        int suffixStart = filePath.indexOf(".txt");
        // Without the guard, substring(0, -1) would throw for names lacking the suffix.
        return suffixStart < 0 ? filePath : filePath.substring(0, suffixStart);
    }


    /**
     * Reads a classpath resource as UTF-8 text and returns its distinct lines.
     *
     * @param classpathPath resource path relative to the classpath root
     * @return the set of lines of the resource
     * @throws IllegalArgumentException if the resource is missing or cannot be read
     */
    public Set<String> load(String classpathPath) {

        try (BufferedReader br = openReader(classpathPath)) {
            return br.lines().collect(Collectors.toSet());
        } catch (IOException e) {
            throw new IllegalArgumentException("could not load classpath resource: " + classpathPath, e);
        }
    }


    /**
     * Reads a classpath resource as UTF-8 text and returns it as one string whose lines are
     * joined with {@code "\n"}.
     *
     * @param classpathPath resource path relative to the classpath root
     * @return the content of the resource
     * @throws IllegalArgumentException if the resource is missing or cannot be read
     */
    public String loadToString(String classpathPath) {

        try (BufferedReader br = openReader(classpathPath)) {
            return br.lines().collect(Collectors.joining("\n"));
        } catch (IOException e) {
            throw new IllegalArgumentException("could not load classpath resource: " + classpathPath, e);
        }
    }


    /**
     * Opens a UTF-8 reader on the given classpath resource. The caller must close it.
     *
     * @param classpathPath resource path relative to the classpath root
     * @return buffered reader positioned at the start of the resource
     * @throws IllegalArgumentException if no such resource exists
     * @throws IOException if the resource cannot be opened
     */
    private BufferedReader openReader(String classpathPath) throws IOException {

        URL resource = ResourceLoader.class.getClassLoader().getResource(classpathPath);
        if (resource == null) {
            throw new IllegalArgumentException("could not load classpath resource: " + classpathPath);
        }
        return new BufferedReader(new InputStreamReader(resource.openStream(), StandardCharsets.UTF_8));
    }

}

View File

@ -0,0 +1,17 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
/**
 * Text clean-up helpers shared by the redaction code.
 */
@UtilityClass
public class TextNormalizationUtilities {

    /**
     * Two alternatives:
     * (1) a whitespace character, a run of non-whitespace (group 1), a hyphen or soft hyphen
     *     (U+00AD) and a line break;
     * (2) a line feed followed by a carriage return, then text ending in a space (group 2).
     * Compiled once, because String.replaceAll would recompile the expression on every call.
     * NOTE(review): alternative (2) matches LF followed by CR; CR LF may have been intended - confirm.
     */
    private static final Pattern HYPHEN_LINE_BREAK = Pattern.compile("\\s(\\S+)[\\-\\u00AD]\\R|\n\r(.+ )");

    /**
     * Revert hyphenation due to line breaks.
     * <p>
     * Every match is replaced by a newline followed by the captured text, which drops the hyphen
     * and the line break behind it. Example: {@code "a hyphen-<LF>ated word"} becomes
     * {@code "a<LF>hyphenated word"}.
     *
     * @param text Text to be processed.
     * @return Text without line-break hyphenation.
     */
    public static String removeHyphenLineBreaks(String text) {

        return HYPHEN_LINE_BREAK.matcher(text).replaceAll("\n$1$2");
    }

}