Pull request #324: RED-3133: Add and combine AI Entities in rules

Merge in RED/redaction-service from RED-3133 to master

* commit '6e720a15c2b9c9d46b52b94f63f0af157ac5d81a':
  RED-3133: Add and combine AI Entities in rules
This commit is contained in:
Dominique Eiflaender 2022-01-20 12:28:17 +01:00
commit c4c96c1712
10 changed files with 137 additions and 38 deletions

View File

@ -1,13 +1,9 @@
package com.iqser.red.service.redaction.v1.server.client;
import java.util.List;
import java.util.Map;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PostMapping;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
@ -16,4 +12,5 @@ public interface EntityRecognitionClient {
/**
 * Declares an HTTP POST to {@code /find_authors} that both consumes and produces
 * {@code application/json}.
 *
 * NOTE(review): presumably resolved by the Feign client declared on the enclosing interface;
 * the interface-level annotation is not visible from here - confirm.
 *
 * @param entityRecognitionRequest the request payload sent as the POST body
 * @return the {@link NerEntities} returned by the entity recognition endpoint
 */
@PostMapping(value = "/find_authors", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
NerEntities findAuthors(EntityRecognitionRequest entityRecognitionRequest);
}

View File

@ -0,0 +1,23 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashSet;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Holder for two separate sets of {@link Entity} objects: the regular entities and the
 * entities that came from NER (AI) detection.
 *
 * Both sets default to an empty {@link HashSet} for the no-args constructor and the builder.
 * The all-args constructor takes the sets in field declaration order
 * ({@code entities}, then {@code nerEntities}); both parameters have the same type, so the
 * order of the fields below must not be changed.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Entities {
// Regular (non-NER) entities; empty set by default.
@Builder.Default
private Set<Entity> entities = new HashSet<>();
// Entities found by NER (AI) detection, kept apart from the regular entities; empty set by default.
@Builder.Default
private Set<Entity> nerEntities = new HashSet<>();
}

View File

@ -15,8 +15,8 @@ import java.util.Set;
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Entity implements ReasonHolder {
private final String word;
private final String type;
private String word;
private String type;
private boolean redaction;
private String redactionReason;
private String legalBasis;

View File

@ -8,6 +8,8 @@ import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@ -44,6 +46,8 @@ public class Section {
private Set<Entity> entities;
private Set<Entity> nerEntities;
// This still contains linebreaks etc.
private String text;
@ -67,6 +71,56 @@ public class Section {
private List<FileAttribute> fileAttributes = new ArrayList<>();
/**
 * Moves the AI (NER) entities of the given type into this section's regular entities,
 * relabelled as {@code asType}.
 *
 * The matching entities are removed from {@code nerEntities}, get their type set to
 * {@code asType}, have their positions recomputed against {@code searchableText}, and are then
 * merged into {@code entities} via {@code addEntitiesWithHigherRank}. Finally, entities that
 * are contained in larger ones are dropped from {@code entities}.
 *
 * @param type   NER type to pick from {@code nerEntities}
 * @param asType type the picked entities carry once they are added to {@code entities}
 */
public void addAiEntities(String type, String asType) {

    List<Entity> matching = nerEntities.stream()
            .filter(nerEntity -> nerEntity.getType().equals(type))
            .collect(Collectors.toList());

    // Detach the entities BEFORE changing their type. Changing a field of an object while it
    // is a member of a hash-based set leaves the set in an unspecified state if that field
    // takes part in equals/hashCode, and a later removeAll could then silently miss it.
    // NOTE(review): whether Entity includes 'type' in equals/hashCode is not visible here;
    // this ordering is correct either way.
    nerEntities.removeAll(matching);

    matching.forEach(nerEntity -> nerEntity.setType(asType));

    // Build the working set only after retyping so its hash buckets match the final state.
    Set<Entity> entitiesOfType = new HashSet<>(matching);

    EntitySearchUtils.clearAndFindPositions(entitiesOfType, searchableText, dictionary);
    EntitySearchUtils.addEntitiesWithHigherRank(entities, entitiesOfType, dictionary);
    EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
/**
 * Combines neighbouring AI (NER) entities into entities of type {@code asType}.
 *
 * The NER entities are visited in order of their start offset. An entity of {@code startType}
 * opens a span. Every following entity whose type is listed in {@code combineTypes} and whose
 * start lies less than {@code maxDistanceBetween} characters after the current span end
 * extends the span. The next {@code startType} entity closes the current span and opens a new
 * one. For each span that consists of the start entity plus at least one extension, the text
 * it covers in {@code searchText} is looked up with {@code findEntities} (case sensitive,
 * redacted, rule number 0, no reason or legal basis, {@code Engine.NER}). The results are
 * merged into {@code entities} via {@code addEntitiesWithHigherRank}, followed by removal of
 * entities contained in larger ones.
 *
 * @param startType          NER type that opens a span
 * @param combineTypes       comma separated NER types that may extend a span; surrounding
 *                           whitespace and repeated names are tolerated
 * @param maxDistanceBetween exclusive upper bound for the gap between the current span end
 *                           and the start of the next extending entity
 * @param asType             type of the entities created for the combined text
 */
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType) {

    // Set.of(...) throws IllegalArgumentException on duplicate elements, so a rule argument
    // such as "CITY,CITY" would fail the whole rule; a HashSet of trimmed tokens does not.
    Set<String> combineSet = new HashSet<>();
    for (String combineType : combineTypes.split(",")) {
        combineSet.add(combineType.trim());
    }

    List<Entity> sorted = nerEntities.stream()
            .sorted(Comparator.comparing(Entity::getStart))
            .collect(Collectors.toList());

    Set<Entity> found = new HashSet<>();
    int start = -1;
    int lastEnd = -1;
    boolean moreThanOne = false;

    for (Entity entity : sorted) {
        if (entity.getType().equals(startType)) {
            // A new start entity closes the span in progress; moreThanOne can only be true
            // once a span has been opened, so this is a no-op for the very first start.
            if (moreThanOne) {
                found.addAll(findCombinedAiEntities(start, lastEnd, asType));
            }
            start = entity.getStart();
            lastEnd = entity.getEnd();
            moreThanOne = false;
        } else if (start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween) {
            // Never shrink the span: an extending entity nested inside the previous one ends
            // before lastEnd and must not cut off the tail of the span.
            lastEnd = Math.max(lastEnd, entity.getEnd());
            moreThanOne = true;
        }
    }

    // Close the span that was still open when the entities ran out.
    if (moreThanOne) {
        found.addAll(findCombinedAiEntities(start, lastEnd, asType));
    }

    if (!found.isEmpty()) {
        EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
        EntitySearchUtils.removeEntitiesContainedInLarger(entities);
    }
}

/** Looks up the text covering {@code [start, end)} of {@code searchText} as NER entities of type {@code asType}. */
private Set<Entity> findCombinedAiEntities(int start, int end, String asType) {

    String value = searchText.substring(start, end);
    return findEntities(value, asType, false, true, 0, null, null, Engine.NER);
}
@WhenCondition
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
@Argument(ArgumentType.STRING) String value) {
@ -153,6 +207,12 @@ public class Section {
}
/**
 * Rule condition: tells whether this section holds at least one AI (NER) entity of the given
 * type that is not flagged as ignored.
 *
 * @param type the NER type to look for
 * @return {@code true} as soon as one non-ignored NER entity of that type is found,
 *         {@code false} otherwise
 */
@WhenCondition
public boolean aiMatchesType(@Argument(ArgumentType.TYPE) String type) {

    for (Entity candidate : nerEntities) {
        // Ignored entities never count, whatever their type.
        if (candidate.isIgnored()) {
            continue;
        }
        if (candidate.getType().equals(type)) {
            return true;
        }
    }
    return false;
}
@WhenCondition
public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {
@ -217,7 +277,7 @@ public class Section {
String match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis());
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE);
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
}
}
@ -346,7 +406,7 @@ public class Section {
while (matcher.find()) {
String match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
expanded.addAll(findEntities(entity.getWord() + match, asType, false, false, 0, null, null));
expanded.addAll(findEntities(entity.getWord() + match, asType, false, false, 0, null, null, Engine.RULE));
}
}
}
@ -369,7 +429,7 @@ public class Section {
while (matcher.find()) {
String match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
Set<Entity> found = findEntities(match.trim(), asType, false, false, 0, null, null);
Set<Entity> found = findEntities(match.trim(), asType, false, false, 0, null, null, Engine.RULE);
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
}
}
@ -398,7 +458,7 @@ public class Section {
public void addHintAnnotation(@Argument(ArgumentType.STRING) String value,
@Argument(ArgumentType.TYPE) String asType) {
Set<Entity> found = findEntities(value.trim(), asType, true, false, 0, null, null);
Set<Entity> found = findEntities(value.trim(), asType, true, false, 0, null, null, Engine.RULE);
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
}
@ -409,7 +469,7 @@ public class Section {
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
Set<Entity> found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis);
Set<Entity> found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis, Engine.RULE);
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
}
@ -426,7 +486,7 @@ public class Section {
if (values != null) {
for (String value : values) {
if (StringUtils.isNotBlank(value)) {
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis);
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
if (redactEverywhere && !isLocal()) {
@ -479,7 +539,7 @@ public class Section {
while (matcher.find()) {
String match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
Set<Entity> found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis);
Set<Entity> found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
}
}
@ -542,7 +602,7 @@ public class Section {
for (String value : values) {
if (StringUtils.isNotBlank(value)) {
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis);
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
if (redactEverywhere && !isLocal()) {
@ -576,7 +636,7 @@ public class Section {
return;
}
Set<Entity> found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis);
Set<Entity> found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
if (redactEverywhere && !isLocal()) {
@ -620,18 +680,19 @@ public class Section {
}
private Set<Entity> findEntities(@Argument(ArgumentType.STRING) String value,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.BOOLEAN) boolean caseInsensitive,
@Argument(ArgumentType.BOOLEAN) boolean redacted,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
private Set<Entity> findEntities(String value,
String asType,
boolean caseInsensitive,
boolean redacted,
int ruleNumber,
String reason,
String legalBasis,
Engine engine) {
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
String searchValue = caseInsensitive ? value.toLowerCase() : value;
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, Engine.RULE);
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, engine);
found.forEach(entity -> {
if (redacted) {

View File

@ -110,7 +110,7 @@ public class AnnotationService {
if (redactionLogEntry.isManual()) {
return "\nManual Redaction\n\nIn Section : \"" + redactionLogEntry.getSection() + "\"";
}
return "\nRule " + redactionLogEntry.getMatchedRule() + " matched\n\n" + redactionLogEntry.getReason() + "\n\nLegal basis:" + redactionLogEntry
return redactionLogEntry.getType() + " \nRule " + redactionLogEntry.getMatchedRule() + " matched\n\n" + redactionLogEntry.getReason() + "\n\nLegal basis:" + redactionLogEntry
.getLegalBasis() + "\n\nIn section: \"" + redactionLogEntry.getSection() + "\"";
}

View File

@ -24,6 +24,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.SectionTex
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
@ -74,14 +75,14 @@ public class EntityRedactionService {
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (SectionText reanalysisSection : reanalysisSections) {
Set<Entity> entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
Entities entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
.getSectionNumber(), dictionary, local, nerEntities, reanalysisSection.getCellStarts());
if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
surroundingWordsService.addSurroundingText(entities.getEntities(), reanalysisSection.getSearchableText(), dictionary, reanalysisSection
.getCellStarts());
} else {
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
surroundingWordsService.addSurroundingText(entities.getEntities(), reanalysisSection.getSearchableText(), dictionary);
}
if (!local && analyzeRequest.getManualRedactions() != null) {
@ -115,7 +116,7 @@ public class EntityRedactionService {
}
}
entities.forEach(entity -> entity.getPositionSequences().forEach(ps -> {
entities.getEntities().forEach(entity -> entity.getPositionSequences().forEach(ps -> {
if (idsToRemove.contains(ps.getId())) {
entity.setIgnored(true);
}
@ -126,9 +127,10 @@ public class EntityRedactionService {
.isLocal(false)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream
.concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
.concat(entities.getEntities().stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
.stream())
.collect(Collectors.toSet()) : entities)
.collect(Collectors.toSet()) : entities.getEntities())
.nerEntities(entities.getNerEntities())
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
.searchText(reanalysisSection.getSearchableText().toString())
.headline(reanalysisSection.getHeadline())
@ -221,14 +223,14 @@ public class EntityRedactionService {
}
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local, NerEntities nerEntities,
List<Integer> cellstarts) {
private Entities findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local, NerEntities nerEntities,
List<Integer> cellstarts) {
Set<Entity> found = new HashSet<>();
String searchableString = searchableText.toString();
if (StringUtils.isEmpty(searchableString)) {
return found;
return new Entities(new HashSet<>(), new HashSet<>());
}
String lowercaseInputString = searchableString.toLowerCase();
@ -242,15 +244,16 @@ public class EntityRedactionService {
}
}
Set<Entity> nerFound = new HashSet<>();
if (!local) {
Map<String, Set<String>> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts);
nerValuesPerType.entrySet().forEach(entry -> {
EntitySearchUtils.addEntitiesWithHigherRank(found, EntitySearchUtils.find(searchableString, entry.getValue(), entry
EntitySearchUtils.addEntitiesWithHigherRank(nerFound, EntitySearchUtils.find(searchableString, entry.getValue(), entry
.getKey(), headline, sectionNumber, false, false, Engine.NER), dictionary);
});
}
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary), nerFound) ;
}

View File

@ -895,7 +895,7 @@ public class RedactionIntegrationTest {
@Test
public void redactionTest() throws IOException {
String fileName = "files/new/Single Study - Oral (Gavage) Mouse.pdf";
String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
long start = System.currentTimeMillis();

View File

@ -1660,4 +1660,4 @@ Zyma SA
Zyma SA, Nyon, Switzerland
Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK
Syngenta Environmental Sciences Jealotts Hill International Research Centre Bracknell, Berkshire RG42 6EY UK
Test Ignored Hint CBI_ADDRESS
Test Ignored Hint CBI_ADDRESS

View File

@ -15,6 +15,21 @@ global Section section
// section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
// end
// Fires for a section for which aiMatchesType("CBI_author") holds and hands its AI entities of
// type CBI_author to addAiEntities, which adds them as recommendation_CBI_author.
rule "0: Add CBI_author from ai"
when
Section(aiMatchesType("CBI_author"))
then
section.addAiEntities("CBI_author", "recommendation_CBI_author");
end
// Fires for a section for which aiMatchesType("ORG") holds and asks combineAiTypes to join each
// ORG entity with following STREET/POSTAL/COUNTRY/CARDINAL/CITY/STATE entities (distance
// argument 100) into recommendation_CBI_address.
// NOTE(review): the rule name mentions CBI_author, but the action produces
// recommendation_CBI_address - confirm whether the name should be adjusted.
rule "0: Combine ai types CBI_author from ai"
when
Section(aiMatchesType("ORG"))
then
section.combineAiTypes("ORG", "STREET,POSTAL,COUNTRY,CARDINAL,CITY,STATE", 100, "recommendation_CBI_address");
end
rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))