Pull request #17: Duplicates

Merge in RED/redaction-service from duplicates to master

* commit '81723ce4022e1e006054eb743f2cc2c0b9faf14f':
  Use void method type
  Use EqualsAndHashcode annotation from Lombok
  Fixed duplicated redaction/RedactionLog entries
  RED-211, RED-215 Added dictionaries and rules for testing.
This commit is contained in:
Thierry Goeckel 2020-08-04 10:59:38 +02:00
commit 70804f111d
15 changed files with 215 additions and 61 deletions

View File

@ -4,7 +4,6 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
@ -18,7 +17,7 @@ public class Document {
private List<Page> pages = new ArrayList<>();
private List<Paragraph> paragraphs = new ArrayList<>();
private Map<Integer, Set<Entity>> entities = new HashMap<>();
private Map<Integer, List<Entity>> entities = new HashMap<>();
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
private FloatFrequencyCounter fontSizeCounter= new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter= new StringFrequencyCounter();

View File

@ -5,21 +5,38 @@ import java.util.ArrayList;
import java.util.List;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode
public class Entity {
private final String word;
private final String type;
@EqualsAndHashCode.Exclude
private boolean redaction;
@EqualsAndHashCode.Exclude
private String redactionReason;
@EqualsAndHashCode.Exclude
private List<EntityPositionSequence> positionSequences = new ArrayList<>();
@EqualsAndHashCode.Exclude
private Integer start;
@EqualsAndHashCode.Exclude
private Integer end;
private String headline;
@EqualsAndHashCode.Exclude
private int matchedRule;
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule) {
private int sectionNumber;
public Entity(String word, String type, boolean redaction, String redactionReason, List<EntityPositionSequence> positionSequences, String headline, int matchedRule, int sectionNumber) {
this.word = word;
this.type = type;
this.redaction = redaction;
@ -27,13 +44,15 @@ public class Entity {
this.positionSequences = positionSequences;
this.headline = headline;
this.matchedRule = matchedRule;
this.sectionNumber = sectionNumber;
}
public Entity(String word, String type, Integer start, Integer end, String headline) {
public Entity(String word, String type, Integer start, Integer end, String headline, int sectionNumber) {
this.word = word;
this.type = type;
this.start = start;
this.end = end;
this.headline = headline;
this.sectionNumber = sectionNumber;
}
}

View File

@ -6,15 +6,20 @@ import java.util.UUID;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.RequiredArgsConstructor;
/**
 * Holds a list of {@link TextPositionSequence}s together with a page number
 * and a {@link UUID}.
 * NOTE(review): presumably one located occurrence of an entity on a page;
 * confirm against the callers that build these objects.
 *
 * <p>Equality and hash code are computed from {@code pageNumber} and
 * {@code id} only; {@code sequences} is excluded through
 * {@code @EqualsAndHashCode.Exclude}.
 *
 * <p>Lombok generates two constructors: one taking only the final {@code id}
 * ({@code @RequiredArgsConstructor}) and one taking all fields in declaration
 * order ({@code @AllArgsConstructor}). Reordering the fields therefore changes
 * the all-args constructor signature.
 */
@Data
@RequiredArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode
public class EntityPositionSequence {
// Not part of equals/hashCode; starts as an empty list when only the id is supplied.
@EqualsAndHashCode.Exclude
private List<TextPositionSequence> sequences = new ArrayList<>();
// Takes part in equals/hashCode together with id.
private int pageNumber;
// The only final field, so the only argument of the required-args constructor.
private final UUID id;
}

View File

@ -25,6 +25,8 @@ public class Section {
private String headline;
private int sectionNumber;
public boolean contains(String type) {
@ -32,6 +34,11 @@ public class Section {
}
/**
 * Tells whether this section's headline contains the given word, ignoring case.
 *
 * @param word the text to look for inside the headline
 * @return the result of {@code StringUtils.containsIgnoreCase(headline, word)}
 */
public boolean headlineContainsWord(String word) {
    final boolean headlineMatches = StringUtils.containsIgnoreCase(headline, word);
    return headlineMatches;
}
public void redact(String type, int ruleNumber, String reason) {
entities.forEach(entity -> {
@ -109,13 +116,11 @@ public class Section {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(searchText.charAt(startIndex - 1)) || isSeparator(searchText
.charAt(startIndex - 1))) && (stopIndex == searchText.length() || isSeparator(searchText.charAt(stopIndex)))) {
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline));
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline, sectionNumber));
}
} while (startIndex > -1);
removeEntitiesContainedInLarger(found);
return found;
return removeEntitiesContainedInLarger(found);
}
@ -125,7 +130,7 @@ public class Section {
}
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
public Set<Entity> removeEntitiesContainedInLarger(Set<Entity> entities) {
List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : entities) {
@ -137,6 +142,7 @@ public class Section {
}
}
entities.removeAll(wordsToRemove);
return entities;
}
}

View File

@ -73,7 +73,7 @@ public class DictionaryService {
.filter(TypeResult::isCaseInsensitive)
.map(TypeResult::getType)
.collect(Collectors.toList());
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, s -> convertEntries(s)));
dictionary = entryColors.keySet().stream().collect(Collectors.toMap(type -> type, this::convertEntries));
defaultColor = dictionaryClient.getDefaultColor().getColor();
}
} catch (FeignException e) {

View File

@ -1,6 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@ -13,6 +14,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
@ -34,6 +36,7 @@ public class EntityRedactionService {
droolsExecutionService.updateRules();
Set<Entity> documentEntities = new HashSet<>();
int sectionNumber = 1;
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
SearchableText searchableText = paragraph.getSearchableText();
@ -51,57 +54,70 @@ public class EntityRedactionService {
searchableRow.addAll(textBlock.getSequences());
}
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline());
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber);
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
.headline(table.getHeadline())
.sectionNumber(sectionNumber)
.build());
for (Entity entity : analysedRowSection.getEntities()) {
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedRowSection.getEntities());
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), searchableRow));
sectionNumber++;
}
sectionNumber++;
}
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline());
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber);
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
.entities(entities)
.text(searchableText.getAsStringWithLinebreaks())
.searchText(searchableText.toString())
.headline(paragraph.getHeadline())
.sectionNumber(sectionNumber)
.build());
for (Entity entity : analysedSection.getEntities()) {
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableText.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedSection.getEntities());
documentEntities.addAll(clearAndFindPositions(analysedSection.getEntities(), searchableText));
sectionNumber++;
}
documentEntities.forEach(entity -> {
entity.getPositionSequences().forEach(sequence -> {
for (Entity entity : documentEntities) {
Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
.add(entityPositionSequence);
}
for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
classifiedDoc.getEntities()
.computeIfAbsent(sequence.getPageNumber(), (x) -> new HashSet<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), List
.of(sequence), entity.getHeadline(), entity.getMatchedRule()));
});
});
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber()));
}
}
}
private Set<Entity> findEntities(SearchableText searchableText, String headline) {
/**
 * Removes entities that are contained in a larger entity and then assigns
 * position sequences to every entity that is left.
 *
 * <p>The given set is modified in place and is also the return value.
 *
 * @param entities the entities of one analysed section; mutated by this call
 * @param text the searchable text in which the entity words are looked up
 * @return the same {@code entities} instance after pruning and position lookup
 */
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text) {
    removeEntitiesContainedInLarger(entities);

    for (Entity candidate : entities) {
        // The lookup flag is true exactly when the entity's type is listed as
        // case-insensitive by the dictionary service.
        boolean caseInsensitive = dictionaryService.getCaseInsensitiveTypes().contains(candidate.getType());
        candidate.setPositionSequences(text.getSequences(candidate.getWord(), caseInsensitive));
    }

    return entities;
}
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber) {
String inputString = searchableText.toString();
String lowercaseInputString = inputString.toLowerCase();
@ -110,19 +126,20 @@ public class EntityRedactionService {
for (Map.Entry<String, Set<String>> entry : dictionaryService.getDictionary().entrySet()) {
if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) {
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline));
found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline, sectionNumber));
} else {
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline));
found.addAll(find(inputString, entry.getValue(), entry.getKey(), headline, sectionNumber));
}
}
removeEntitiesContainedInLarger(found);
return found;
}
private Set<Entity> find(String inputString, Set<String> values, String type, String headline) {
private Set<Entity> find(String inputString, Set<String> values, String type, String headline, int sectionNumber) {
Set<Entity> found = new HashSet<>();
for (String value : values) {
@ -134,7 +151,7 @@ public class EntityRedactionService {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline));
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber));
}
} while (startIndex > -1);
}

View File

@ -58,6 +58,8 @@ public class RedactionIntegrationTest {
private static final String ADDRESS_CODE = "address";
private static final String NAME_CODE = "name";
private static final String NO_REDACTION_INDICATOR = "no_redaction_indicator";
private static final String REDACTION_INDICATOR = "redaction_indicator";
private static final String HINT_ONLY = "hint_only";
@Autowired
private RedactionController redactionController;
@ -109,6 +111,8 @@ public class RedactionIntegrationTest {
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(getDictionaryResponse(ADDRESS_CODE));
when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(getDictionaryResponse(NAME_CODE));
when(dictionaryClient.getDictionaryForType(NO_REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(NO_REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(REDACTION_INDICATOR)).thenReturn(getDictionaryResponse(REDACTION_INDICATOR));
when(dictionaryClient.getDictionaryForType(HINT_ONLY)).thenReturn(getDictionaryResponse(HINT_ONLY));
when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor(new float[]{1f, 0.502f, 0f}));
}
@ -131,7 +135,17 @@ public class RedactionIntegrationTest {
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(NO_REDACTION_INDICATOR, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/NoRedactionIndicator.txt")
.addAll(ResourceLoader.load("dictionaries/no_redaction_indicator.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(REDACTION_INDICATOR, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/redaction_indicator.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
dictionary.computeIfAbsent(HINT_ONLY, v -> new ArrayList<>())
.addAll(ResourceLoader.load("dictionaries/hint_only.txt")
.stream()
.map(this::cleanDictionaryEntry)
.collect(Collectors.toSet()));
@ -149,17 +163,23 @@ public class RedactionIntegrationTest {
typeColorMap.put(VERTEBRATES_CODE, new float[]{0, 1, 0});
typeColorMap.put(ADDRESS_CODE, new float[]{0, 1, 1});
typeColorMap.put(NAME_CODE, new float[]{1, 1, 0});
typeColorMap.put(NO_REDACTION_INDICATOR, new float[]{1, 0.502f, 0});
typeColorMap.put(NO_REDACTION_INDICATOR, new float[]{0.8f, 0, 0.8f});
typeColorMap.put(REDACTION_INDICATOR, new float[]{1, 0.502f, 0.1f});
typeColorMap.put(HINT_ONLY, new float[]{0.8f, 1, 0.8f});
hintTypeMap.put(VERTEBRATES_CODE, true);
hintTypeMap.put(ADDRESS_CODE, false);
hintTypeMap.put(NAME_CODE, false);
hintTypeMap.put(NO_REDACTION_INDICATOR, true);
hintTypeMap.put(REDACTION_INDICATOR, true);
hintTypeMap.put(HINT_ONLY, true);
caseInSensitiveMap.put(VERTEBRATES_CODE, true);
caseInSensitiveMap.put(ADDRESS_CODE, false);
caseInSensitiveMap.put(NAME_CODE, false);
caseInSensitiveMap.put(NO_REDACTION_INDICATOR, true);
caseInSensitiveMap.put(REDACTION_INDICATOR, true);
caseInSensitiveMap.put(HINT_ONLY, true);
}

View File

@ -0,0 +1,50 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.HashSet;
import java.util.Set;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.runtime.KieContainer;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.test.context.junit4.SpringRunner;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
/**
 * Spring Boot test for {@link EntityRedactionService}. The Drools container,
 * the rule execution service and the dictionary service are replaced by
 * Mockito mocks, so only the service's own logic is exercised.
 */
@RunWith(SpringRunner.class)
@SpringBootTest
public class EntityRedactionServiceTest {

    @MockBean
    private KieContainer kieContainer;

    @MockBean
    private DroolsExecutionService droolsExecutionService;

    @MockBean
    private DictionaryService dictionaryService;

    @Autowired
    private EntityRedactionService entityRedactionService;

    /**
     * An entity whose start/end range (10-16) lies inside the range of another
     * entity (2-16) must be removed, leaving only the enclosing entity.
     */
    @Test
    public void testNestedEntitiesRemoval() {
        Entity inner = new Entity("nested", "fake type", 10, 16, "fake headline", 0);
        Entity outer = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0);

        Set<Entity> candidates = new HashSet<>();
        candidates.add(inner);
        candidates.add(outer);

        entityRedactionService.removeEntitiesContainedInLarger(candidates);

        assertThat(candidates).hasSize(1).contains(outer);
    }
}

View File

@ -0,0 +1,2 @@
guideline
unpublished

View File

@ -674,6 +674,7 @@ Barnes E
Blunt H
Bohnenberger S
Broich K
Broich
Cords SM
Cowie D
Davies S
@ -710,6 +711,8 @@ Randall R
Manton J
Nagy K
PetusÁrpásy M
PetusÁrpásy
Petus-Árpásy
Roth M
Shearer J
Sieber M
@ -2380,6 +2383,7 @@ Bowles
Dollenmeier
Myhr
Chang et al
Chang
Beck
Orton
Medjakovic

View File

@ -0,0 +1,3 @@
published paper
in vitro
in-vitro

View File

@ -0,0 +1,9 @@
in vivo
in-vivo
dermal penetration
oral toxicity
oral-toxicity
acute toxicity
acute-toxicity
eco toxicity
eco-toxicity

View File

@ -226,4 +226,5 @@ Poultry
Guinea-pigs
White rabbits
Birds
Wood mice
Wood mice
Mallard

View File

@ -32,27 +32,47 @@ rule "3: Do not redact Names and Addresses if no redaction Indicator is containe
end
rule "4: Redact contact information, if applicant is found"
rule "4: Redact Names and Addresses if no_redaction_indicator and redaction_indicator is contained"
when
eval(section.getText().toLowerCase().contains("applicant"));
eval(section.contains("vertebrate")==true && section.contains("no_redaction_indicator")==true && section.contains("redaction_indicator")==true);
then
section.redactLineAfter("Name:", "address", 4, "Redacted because of Rule 4");
section.redactBetween("Address:", "Contact", "address", 4, "Redacted because of Rule 4");
section.redactLineAfter("Contact point:", "address", 4, "Redacted because of Rule 4");
section.redactLineAfter("Phone:", "address", 4, "Redacted because of Rule 4");
section.redactLineAfter("Fax:", "address", 4, "Redacted because of Rule 4");
section.redactLineAfter("E-mail:", "address", 4, "Redacted because of Rule 4");
section.redact("name", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator");
section.redact("address", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator");
end
rule "5: Redact contact information, if 'Producer of the plant protection product' is found"
// Rule 5: fires when the section headline contains "guideline" or "Guidance".
// The headline check ignores case, because Section.headlineContainsWord
// delegates to StringUtils.containsIgnoreCase.
// NOTE(review): redactNot presumably clears the redaction flag for entities of
// the given type and records rule number 5 with the reason text; confirm
// against Section.redactNot, which is not visible here.
rule "5: Do not redact in guideline sections"
when
eval(section.headlineContainsWord("guideline") || section.headlineContainsWord("Guidance"));
then
// The same reason text is used for both entity types.
section.redactNot("name", 5, "Section is a guideline section.");
section.redactNot("address", 5, "Section is a guideline section.");
end
// Rule 6: fires when the lower-cased section text contains "applicant".
// Every call below passes the type "address", rule number 6 and the same
// reason text; only the label(s) given as the leading argument(s) differ.
rule "6: Redact contact information, if applicant is found"
    when
        eval(section.getText().toLowerCase().contains("applicant"));
    then
        section.redactLineAfter("Name:", "address", 6, "contact information was found");
        section.redactBetween("Address:", "Contact", "address", 6, "contact information was found");
        section.redactLineAfter("Contact point:", "address", 6, "contact information was found");
        section.redactLineAfter("Phone:", "address", 6, "contact information was found");
        section.redactLineAfter("Fax:", "address", 6, "contact information was found");
        section.redactLineAfter("E-mail:", "address", 6, "contact information was found");
        section.redactLineAfter("Contact:", "address", 6, "contact information was found");
        section.redactLineAfter("Telephone number:", "address", 6, "contact information was found");
end
rule "7: Redact contact information, if 'Producer of the plant protection product' is found"
when
eval(section.getText().contains("Producer of the plant protection product"));
then
section.redactLineAfter("Name:", "address", 5, "xxxx");
section.redactBetween("Address:", "Contact", "address", 5, "xxxx");
section.redactBetween("Contact:", "Phone", "address", 5, "xxxx");
section.redactLineAfter("Phone:", "address", 5, "xxxx");
section.redactLineAfter("Fax:", "address", 5, "xxxx");
section.redactLineAfter("E-mail:", "address", 5, "xxxx");
section.redactLineAfter("Name:", "address", 7, "Producer of the plant protection product was found");
section.redactBetween("Address:", "Contact", "address", 7, "Producer of the plant protection product was found");
section.redactBetween("Contact:", "Phone", "address", 7, "Producer of the plant protection product was found");
section.redactLineAfter("Phone:", "address", 7, "Producer of the plant protection product was found");
section.redactLineAfter("Fax:", "address", 7, "Producer of the plant protection product was found");
section.redactLineAfter("E-mail:", "address", 7, "Producer of the plant protection product was found");
end