Pull request #324: RED-3133: Add and combine AI Entities in rules
Merge in RED/redaction-service from RED-3133 to master * commit '6e720a15c2b9c9d46b52b94f63f0af157ac5d81a': RED-3133: Add and combine AI Entities in rules
This commit is contained in:
commit
c4c96c1712
@ -1,13 +1,9 @@
|
||||
package com.iqser.red.service.redaction.v1.server.client;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest;
|
||||
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
|
||||
|
||||
@ -16,4 +12,5 @@ public interface EntityRecognitionClient {
|
||||
|
||||
@PostMapping(value = "/find_authors", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
|
||||
NerEntities findAuthors(EntityRecognitionRequest entityRecognitionRequest);
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,23 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Entities {
|
||||
|
||||
@Builder.Default
|
||||
private Set<Entity> entities = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
private Set<Entity> nerEntities = new HashSet<>();
|
||||
|
||||
}
|
||||
@ -15,8 +15,8 @@ import java.util.Set;
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Entity implements ReasonHolder {
|
||||
|
||||
private final String word;
|
||||
private final String type;
|
||||
private String word;
|
||||
private String type;
|
||||
private boolean redaction;
|
||||
private String redactionReason;
|
||||
private String legalBasis;
|
||||
|
||||
@ -8,6 +8,8 @@ import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -44,6 +46,8 @@ public class Section {
|
||||
|
||||
private Set<Entity> entities;
|
||||
|
||||
private Set<Entity> nerEntities;
|
||||
|
||||
// This still contains linebreaks etc.
|
||||
private String text;
|
||||
|
||||
@ -67,6 +71,56 @@ public class Section {
|
||||
private List<FileAttribute> fileAttributes = new ArrayList<>();
|
||||
|
||||
|
||||
|
||||
public void addAiEntities(String type, String asType){
|
||||
|
||||
Set<Entity> entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet());
|
||||
entitiesOfType.forEach(nerEntity -> nerEntity.setType(asType));
|
||||
EntitySearchUtils.clearAndFindPositions(entitiesOfType, searchableText, dictionary);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, entitiesOfType, dictionary);
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
nerEntities.removeAll(entitiesOfType);
|
||||
}
|
||||
|
||||
|
||||
public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType){
|
||||
|
||||
Set<String> combineSet = Set.of(combineTypes.split(","));
|
||||
|
||||
List<Entity> sorted = nerEntities.stream().sorted(Comparator.comparing(Entity::getStart)).collect(Collectors.toList());
|
||||
Set<Entity> found = new HashSet<>();
|
||||
int start = -1;
|
||||
int lastEnd = -1;
|
||||
boolean moreThanOne = false;
|
||||
for (Entity entity : sorted){
|
||||
if(entity.getType().equals(startType) && start == -1){
|
||||
lastEnd = entity.getEnd();
|
||||
start = entity.getStart();
|
||||
} else if(entity.getType().equals(startType) && start != -1){
|
||||
if(moreThanOne) {
|
||||
String value = searchText.substring(start, lastEnd);
|
||||
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
|
||||
}
|
||||
start = entity.getStart();
|
||||
lastEnd = entity.getEnd();
|
||||
moreThanOne = false;
|
||||
} else if(start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween){
|
||||
lastEnd = entity.getEnd();
|
||||
moreThanOne = true;
|
||||
}
|
||||
}
|
||||
|
||||
if(moreThanOne) {
|
||||
String value = searchText.substring(start, lastEnd);
|
||||
found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER));
|
||||
}
|
||||
|
||||
if(!found.isEmpty()) {
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
}
|
||||
}
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
@ -153,6 +207,12 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean aiMatchesType(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
return nerEntities.stream().anyMatch(entity -> !entity.isIgnored() && entity.getType().equals(type));
|
||||
}
|
||||
|
||||
@WhenCondition
|
||||
public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
@ -217,7 +277,7 @@ public class Section {
|
||||
String match = matcher.group(group);
|
||||
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis());
|
||||
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE);
|
||||
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
|
||||
}
|
||||
}
|
||||
@ -346,7 +406,7 @@ public class Section {
|
||||
while (matcher.find()) {
|
||||
String match = matcher.group(group);
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
expanded.addAll(findEntities(entity.getWord() + match, asType, false, false, 0, null, null));
|
||||
expanded.addAll(findEntities(entity.getWord() + match, asType, false, false, 0, null, null, Engine.RULE));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -369,7 +429,7 @@ public class Section {
|
||||
while (matcher.find()) {
|
||||
String match = matcher.group(group);
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
Set<Entity> found = findEntities(match.trim(), asType, false, false, 0, null, null);
|
||||
Set<Entity> found = findEntities(match.trim(), asType, false, false, 0, null, null, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
}
|
||||
}
|
||||
@ -398,7 +458,7 @@ public class Section {
|
||||
public void addHintAnnotation(@Argument(ArgumentType.STRING) String value,
|
||||
@Argument(ArgumentType.TYPE) String asType) {
|
||||
|
||||
Set<Entity> found = findEntities(value.trim(), asType, true, false, 0, null, null);
|
||||
Set<Entity> found = findEntities(value.trim(), asType, true, false, 0, null, null, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
|
||||
}
|
||||
|
||||
@ -409,7 +469,7 @@ public class Section {
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
|
||||
|
||||
Set<Entity> found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis);
|
||||
Set<Entity> found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
|
||||
}
|
||||
|
||||
@ -426,7 +486,7 @@ public class Section {
|
||||
if (values != null) {
|
||||
for (String value : values) {
|
||||
if (StringUtils.isNotBlank(value)) {
|
||||
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis);
|
||||
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
|
||||
if (redactEverywhere && !isLocal()) {
|
||||
@ -479,7 +539,7 @@ public class Section {
|
||||
while (matcher.find()) {
|
||||
String match = matcher.group(group);
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
Set<Entity> found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis);
|
||||
Set<Entity> found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
}
|
||||
}
|
||||
@ -542,7 +602,7 @@ public class Section {
|
||||
for (String value : values) {
|
||||
if (StringUtils.isNotBlank(value)) {
|
||||
|
||||
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis);
|
||||
Set<Entity> found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
|
||||
if (redactEverywhere && !isLocal()) {
|
||||
@ -576,7 +636,7 @@ public class Section {
|
||||
return;
|
||||
}
|
||||
|
||||
Set<Entity> found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis);
|
||||
Set<Entity> found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE);
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
|
||||
|
||||
if (redactEverywhere && !isLocal()) {
|
||||
@ -620,18 +680,19 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(@Argument(ArgumentType.STRING) String value,
|
||||
@Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean caseInsensitive,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean redacted,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
|
||||
private Set<Entity> findEntities(String value,
|
||||
String asType,
|
||||
boolean caseInsensitive,
|
||||
boolean redacted,
|
||||
int ruleNumber,
|
||||
String reason,
|
||||
String legalBasis,
|
||||
Engine engine) {
|
||||
|
||||
String text = caseInsensitive ? searchText.toLowerCase() : searchText;
|
||||
String searchValue = caseInsensitive ? value.toLowerCase() : value;
|
||||
|
||||
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, Engine.RULE);
|
||||
Set<Entity> found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, engine);
|
||||
|
||||
found.forEach(entity -> {
|
||||
if (redacted) {
|
||||
|
||||
@ -110,7 +110,7 @@ public class AnnotationService {
|
||||
if (redactionLogEntry.isManual()) {
|
||||
return "\nManual Redaction\n\nIn Section : \"" + redactionLogEntry.getSection() + "\"";
|
||||
}
|
||||
return "\nRule " + redactionLogEntry.getMatchedRule() + " matched\n\n" + redactionLogEntry.getReason() + "\n\nLegal basis:" + redactionLogEntry
|
||||
return redactionLogEntry.getType() + " \nRule " + redactionLogEntry.getMatchedRule() + " matched\n\n" + redactionLogEntry.getReason() + "\n\nLegal basis:" + redactionLogEntry
|
||||
.getLegalBasis() + "\n\nIn section: \"" + redactionLogEntry.getSection() + "\"";
|
||||
}
|
||||
|
||||
|
||||
@ -24,6 +24,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.SectionTex
|
||||
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
@ -74,14 +75,14 @@ public class EntityRedactionService {
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (SectionText reanalysisSection : reanalysisSections) {
|
||||
|
||||
Set<Entity> entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
|
||||
Entities entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection
|
||||
.getSectionNumber(), dictionary, local, nerEntities, reanalysisSection.getCellStarts());
|
||||
|
||||
if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
surroundingWordsService.addSurroundingText(entities.getEntities(), reanalysisSection.getSearchableText(), dictionary, reanalysisSection
|
||||
.getCellStarts());
|
||||
} else {
|
||||
surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary);
|
||||
surroundingWordsService.addSurroundingText(entities.getEntities(), reanalysisSection.getSearchableText(), dictionary);
|
||||
}
|
||||
|
||||
if (!local && analyzeRequest.getManualRedactions() != null) {
|
||||
@ -115,7 +116,7 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
entities.forEach(entity -> entity.getPositionSequences().forEach(ps -> {
|
||||
entities.getEntities().forEach(entity -> entity.getPositionSequences().forEach(ps -> {
|
||||
if (idsToRemove.contains(ps.getId())) {
|
||||
entity.setIgnored(true);
|
||||
}
|
||||
@ -126,9 +127,10 @@ public class EntityRedactionService {
|
||||
.isLocal(false)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream
|
||||
.concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
|
||||
.concat(entities.getEntities().stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber())
|
||||
.stream())
|
||||
.collect(Collectors.toSet()) : entities)
|
||||
.collect(Collectors.toSet()) : entities.getEntities())
|
||||
.nerEntities(entities.getNerEntities())
|
||||
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
|
||||
.searchText(reanalysisSection.getSearchableText().toString())
|
||||
.headline(reanalysisSection.getHeadline())
|
||||
@ -221,14 +223,14 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
|
||||
Dictionary dictionary, boolean local, NerEntities nerEntities,
|
||||
List<Integer> cellstarts) {
|
||||
private Entities findEntities(SearchableText searchableText, String headline, int sectionNumber,
|
||||
Dictionary dictionary, boolean local, NerEntities nerEntities,
|
||||
List<Integer> cellstarts) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
String searchableString = searchableText.toString();
|
||||
if (StringUtils.isEmpty(searchableString)) {
|
||||
return found;
|
||||
return new Entities(new HashSet<>(), new HashSet<>());
|
||||
}
|
||||
|
||||
String lowercaseInputString = searchableString.toLowerCase();
|
||||
@ -242,15 +244,16 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
Set<Entity> nerFound = new HashSet<>();
|
||||
if (!local) {
|
||||
Map<String, Set<String>> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts);
|
||||
nerValuesPerType.entrySet().forEach(entry -> {
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(found, EntitySearchUtils.find(searchableString, entry.getValue(), entry
|
||||
EntitySearchUtils.addEntitiesWithHigherRank(nerFound, EntitySearchUtils.find(searchableString, entry.getValue(), entry
|
||||
.getKey(), headline, sectionNumber, false, false, Engine.NER), dictionary);
|
||||
});
|
||||
}
|
||||
|
||||
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary);
|
||||
return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary), nerFound) ;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -895,7 +895,7 @@ public class RedactionIntegrationTest {
|
||||
@Test
|
||||
public void redactionTest() throws IOException {
|
||||
|
||||
String fileName = "files/new/Single Study - Oral (Gavage) Mouse.pdf";
|
||||
String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
@ -1660,4 +1660,4 @@ Zyma SA
|
||||
Zyma SA, Nyon, Switzerland
|
||||
Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK
|
||||
Syngenta Environmental Sciences Jealott’s Hill International Research Centre Bracknell, Berkshire RG42 6EY UK
|
||||
Test Ignored Hint CBI_ADDRESS
|
||||
Test Ignored Hint CBI_ADDRESS
|
||||
@ -15,6 +15,21 @@ global Section section
|
||||
// section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1);
|
||||
// end
|
||||
|
||||
|
||||
// Fires for a section that holds at least one non-ignored AI entity of type CBI_author and
// moves the section's CBI_author AI entities into its regular entities, re-typed as
// recommendation_CBI_author.
rule "0: Add CBI_author from ai"
|
||||
when
|
||||
Section(aiMatchesType("CBI_author"))
|
||||
then
|
||||
section.addAiEntities("CBI_author", "recommendation_CBI_author");
|
||||
end
|
||||
|
||||
// Fires for a section that holds at least one non-ignored AI entity of type ORG. Each ORG
// entity is combined with following STREET/POSTAL/COUNTRY/CARDINAL/CITY/STATE AI entities
// that start fewer than 100 characters after the previous one ends; the spanned text is added
// as recommendation_CBI_address.
// NOTE(review): the rule name mentions CBI_author although the action produces
// recommendation_CBI_address - it looks copy-pasted from the rule above; confirm before renaming.
rule "0: Combine ai types CBI_author from ai"
|
||||
when
|
||||
Section(aiMatchesType("ORG"))
|
||||
then
|
||||
section.combineAiTypes("ORG", "STREET,POSTAL,COUNTRY,CARDINAL,CITY,STATE", 100, "recommendation_CBI_address");
|
||||
end
|
||||
|
||||
rule "0: Expand CBI Authors with firstname initials"
|
||||
when
|
||||
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user