diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/EntityRecognitionClient.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/EntityRecognitionClient.java index 336b64f8..df29fc94 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/EntityRecognitionClient.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/EntityRecognitionClient.java @@ -1,13 +1,9 @@ package com.iqser.red.service.redaction.v1.server.client; -import java.util.List; -import java.util.Map; - import org.springframework.cloud.openfeign.FeignClient; import org.springframework.http.MediaType; import org.springframework.web.bind.annotation.PostMapping; -import com.iqser.red.service.redaction.v1.server.client.model.EntityRecogintionEntity; import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionRequest; import com.iqser.red.service.redaction.v1.server.client.model.NerEntities; @@ -16,4 +12,5 @@ public interface EntityRecognitionClient { @PostMapping(value = "/find_authors", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) NerEntities findAuthors(EntityRecognitionRequest entityRecognitionRequest); + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entities.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entities.java new file mode 100644 index 00000000..de14cbbd --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entities.java @@ -0,0 +1,23 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + 
+import java.util.HashSet; +import java.util.Set; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +/** Pairs the regular entity set with the separately kept NER entity set. */ +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class Entities { + /** Regular entities; builder default is an empty HashSet. */ + @Builder.Default + private Set entities = new HashSet<>(); + /** Entities found with the NER engine, kept apart from {@code entities}; builder default is an empty HashSet. */ + @Builder.Default + private Set nerEntities = new HashSet<>(); + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java index 5dd418ac..c313d734 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java @@ -15,8 +15,8 @@ import java.util.Set; @EqualsAndHashCode(onlyExplicitlyIncluded = true) public class Entity implements ReasonHolder { - private final String word; - private final String type; + private String word; + private String type; private boolean redaction; private String redactionReason; private String legalBasis; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 2672a58b..c913626b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -8,6 +8,8 @@ import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; 
import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -44,6 +46,8 @@ public class Section { private Set entities; + private Set nerEntities; + // This still contains linebreaks etc. private String text; @@ -67,6 +71,56 @@ public class Section { private List fileAttributes = new ArrayList<>(); + + /** Takes every NER entity whose type equals {@code type} out of {@code nerEntities}, retypes it to {@code asType} and passes it to EntitySearchUtils (clearAndFindPositions, addEntitiesWithHigherRank, removeEntitiesContainedInLarger) for merging into {@code entities}. */ + public void addAiEntities(String type, String asType){ + /* Detach from nerEntities BEFORE retyping: if Entity's equals/hashCode include the type (not visible here - TODO confirm), retyping first would leave stale hashes and removeAll would silently miss the elements. */ + Set entitiesOfType = nerEntities.stream().filter(nerEntity -> nerEntity.getType().equals(type)).collect(Collectors.toSet()); + nerEntities.removeAll(entitiesOfType); + entitiesOfType.forEach(nerEntity -> nerEntity.setType(asType)); + EntitySearchUtils.clearAndFindPositions(entitiesOfType, searchableText, dictionary); + EntitySearchUtils.addEntitiesWithHigherRank(entities, entitiesOfType, dictionary); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + } + + /** Walks {@code nerEntities} in ascending start order. An entity of {@code startType} opens a span; following entities whose type is listed in the comma-separated {@code combineTypes} and whose start lies less than {@code maxDistanceBetween} past the current span end extend it. Each span that absorbed at least one such entity is cut from {@code searchText}, looked up via findEntities as {@code asType} with Engine.NER and merged into {@code entities}. */ + public void combineAiTypes(String startType, String combineTypes, int maxDistanceBetween, String asType){ + + Set combineSet = Set.of(combineTypes.split(",")); + + List sorted = nerEntities.stream().sorted(Comparator.comparing(Entity::getStart)).collect(Collectors.toList()); + Set found = new HashSet<>(); + int start = -1; + int lastEnd = -1; + boolean moreThanOne = false; + for (Entity entity : sorted){ + if(entity.getType().equals(startType) && start == -1){ + lastEnd = entity.getEnd(); + start = entity.getStart(); + } else if(entity.getType().equals(startType) && start != -1){ + if(moreThanOne) { + String value = searchText.substring(start, lastEnd); + found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER)); + } + start = entity.getStart(); + lastEnd = entity.getEnd(); + moreThanOne = false; + } else if(start != -1 && combineSet.contains(entity.getType()) && entity.getStart() - lastEnd < maxDistanceBetween){ + lastEnd = Math.max(lastEnd, entity.getEnd()); /* an entity that ends inside the current span must not shrink it */ + moreThanOne = true; + } + } + + 
if(moreThanOne) { + String value = searchText.substring(start, lastEnd); + found.addAll(findEntities(value, asType, false, true, 0, null, null, Engine.NER)); + } + + if(!found.isEmpty()) { + EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + } + } + @WhenCondition public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id, @Argument(ArgumentType.STRING) String value) { @@ -153,6 +207,12 @@ public class Section { } + /** Reports whether {@code nerEntities} holds a non-ignored entity of the given type. */ @WhenCondition + public boolean aiMatchesType(@Argument(ArgumentType.TYPE) String type) { + /* nerEntities has no field initializer; a Section created without it is treated as having no NER matches instead of throwing a NullPointerException. */ + return nerEntities != null && nerEntities.stream().anyMatch(entity -> !entity.isIgnored() && entity.getType().equals(type)); + } + @WhenCondition public boolean matchesType(@Argument(ArgumentType.TYPE) String type) { @@ -217,7 +277,7 @@ public class Section { String match = matcher.group(group); if (StringUtils.isNotBlank(match)) { - Set expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis()); + Set expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis(), Engine.RULE); expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities)); } } @@ -346,7 +406,7 @@ public class Section { while (matcher.find()) { String match = matcher.group(group); if (StringUtils.isNotBlank(match)) { - expanded.addAll(findEntities(entity.getWord() + match, asType, false, false, 0, null, null)); + expanded.addAll(findEntities(entity.getWord() + match, asType, false, false, 0, null, null, Engine.RULE)); } } } @@ -369,7 +429,7 @@ public class Section { while (matcher.find()) { String match = matcher.group(group); if (StringUtils.isNotBlank(match)) { - Set found = findEntities(match.trim(), asType, false, false, 0, null, null); + Set found = 
findEntities(match.trim(), asType, false, false, 0, null, null, Engine.RULE); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); } } @@ -398,7 +458,7 @@ public class Section { public void addHintAnnotation(@Argument(ArgumentType.STRING) String value, @Argument(ArgumentType.TYPE) String asType) { - Set found = findEntities(value.trim(), asType, true, false, 0, null, null); + Set found = findEntities(value.trim(), asType, true, false, 0, null, null, Engine.RULE); EntitySearchUtils.addEntitiesIgnoreRank(entities, found); } @@ -409,7 +469,7 @@ public class Section { @Argument(ArgumentType.STRING) String reason, @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { - Set found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis); + Set found = findEntities(value.trim(), asType, true, true, ruleNumber, reason, legalBasis, Engine.RULE); EntitySearchUtils.addEntitiesIgnoreRank(entities, found); } @@ -426,7 +486,7 @@ public class Section { if (values != null) { for (String value : values) { if (StringUtils.isNotBlank(value)) { - Set found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis); + Set found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); if (redactEverywhere && !isLocal()) { @@ -479,7 +539,7 @@ public class Section { while (matcher.find()) { String match = matcher.group(group); if (StringUtils.isNotBlank(match)) { - Set found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis); + Set found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); } } @@ -542,7 +602,7 @@ public class Section { for (String value : values) { if (StringUtils.isNotBlank(value)) { - Set found = findEntities(value.trim(), asType, false, true, 
ruleNumber, reason, legalBasis); + Set found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); if (redactEverywhere && !isLocal()) { @@ -576,7 +636,7 @@ public class Section { return; } - Set found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis); + Set found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); if (redactEverywhere && !isLocal()) { @@ -620,18 +680,19 @@ public class Section { } - private Set findEntities(@Argument(ArgumentType.STRING) String value, - @Argument(ArgumentType.TYPE) String asType, - @Argument(ArgumentType.BOOLEAN) boolean caseInsensitive, - @Argument(ArgumentType.BOOLEAN) boolean redacted, - @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, - @Argument(ArgumentType.STRING) String reason, - @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { + private Set findEntities(String value, + String asType, + boolean caseInsensitive, + boolean redacted, + int ruleNumber, + String reason, + String legalBasis, + Engine engine) { String text = caseInsensitive ? searchText.toLowerCase() : searchText; String searchValue = caseInsensitive ? 
value.toLowerCase() : value; - Set found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, Engine.RULE); + Set found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, false, false, engine); found.forEach(entity -> { if (redacted) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java index 1d048208..c0376d89 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/AnnotationService.java @@ -110,7 +110,7 @@ public class AnnotationService { if (redactionLogEntry.isManual()) { return "\nManual Redaction\n\nIn Section : \"" + redactionLogEntry.getSection() + "\""; } - return "\nRule " + redactionLogEntry.getMatchedRule() + " matched\n\n" + redactionLogEntry.getReason() + "\n\nLegal basis:" + redactionLogEntry + return redactionLogEntry.getType() + " \nRule " + redactionLogEntry.getMatchedRule() + " matched\n\n" + redactionLogEntry.getReason() + "\n\nLegal basis:" + redactionLogEntry .getLegalBasis() + "\n\nIn section: \"" + redactionLogEntry.getSection() + "\""; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 820edfff..1955cd46 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -24,6 +24,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.SectionTex import com.iqser.red.service.redaction.v1.server.client.model.NerEntities; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entities; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.Image; @@ -74,14 +75,14 @@ public class EntityRedactionService { List sectionSearchableTextPairs = new ArrayList<>(); for (SectionText reanalysisSection : reanalysisSections) { - Set entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection + Entities entities = findEntities(reanalysisSection.getSearchableText(), reanalysisSection.getHeadline(), reanalysisSection .getSectionNumber(), dictionary, local, nerEntities, reanalysisSection.getCellStarts()); if (reanalysisSection.getCellStarts() != null && !reanalysisSection.getCellStarts().isEmpty()) { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection + surroundingWordsService.addSurroundingText(entities.getEntities(), reanalysisSection.getSearchableText(), dictionary, reanalysisSection .getCellStarts()); } else { - surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); + 
surroundingWordsService.addSurroundingText(entities.getEntities(), reanalysisSection.getSearchableText(), dictionary); } if (!local && analyzeRequest.getManualRedactions() != null) { @@ -115,7 +116,7 @@ public class EntityRedactionService { } } - entities.forEach(entity -> entity.getPositionSequences().forEach(ps -> { + entities.getEntities().forEach(entity -> entity.getPositionSequences().forEach(ps -> { if (idsToRemove.contains(ps.getId())) { entity.setIgnored(true); } @@ -126,9 +127,10 @@ public class EntityRedactionService { .isLocal(false) .dictionaryTypes(dictionary.getTypes()) .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(reanalysisSection.getSectionNumber()) ? Stream - .concat(entities.stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()) + .concat(entities.getEntities().stream(), hintsPerSectionNumber.get(reanalysisSection.getSectionNumber()) .stream()) - .collect(Collectors.toSet()) : entities) + .collect(Collectors.toSet()) : entities.getEntities()) + .nerEntities(entities.getNerEntities()) .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) .searchText(reanalysisSection.getSearchableText().toString()) .headline(reanalysisSection.getHeadline()) @@ -221,14 +223,14 @@ public class EntityRedactionService { } - private Set findEntities(SearchableText searchableText, String headline, int sectionNumber, - Dictionary dictionary, boolean local, NerEntities nerEntities, - List cellstarts) { + private Entities findEntities(SearchableText searchableText, String headline, int sectionNumber, + Dictionary dictionary, boolean local, NerEntities nerEntities, + List cellstarts) { Set found = new HashSet<>(); String searchableString = searchableText.toString(); if (StringUtils.isEmpty(searchableString)) { - return found; + return new Entities(new HashSet<>(), new HashSet<>()); } String lowercaseInputString = searchableString.toLowerCase(); @@ -242,15 +244,16 @@ public class EntityRedactionService { } 
} + Set nerFound = new HashSet<>(); if (!local) { Map> nerValuesPerType = getNerValues(sectionNumber, nerEntities, cellstarts); nerValuesPerType.entrySet().forEach(entry -> { - EntitySearchUtils.addEntitiesWithHigherRank(found, EntitySearchUtils.find(searchableString, entry.getValue(), entry + EntitySearchUtils.addEntitiesWithHigherRank(nerFound, EntitySearchUtils.find(searchableString, entry.getValue(), entry .getKey(), headline, sectionNumber, false, false, Engine.NER), dictionary); }); } - return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary); + return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary), nerFound) ; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index cee85a84..024d5b4a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -895,7 +895,7 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { - String fileName = "files/new/Single Study - Oral (Gavage) Mouse.pdf"; + String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf"; long start = System.currentTimeMillis(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt index e308a6f3..1c68c41a 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt @@ -1660,4 +1660,4 @@ Zyma SA Zyma SA, Nyon, Switzerland Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK Syngenta Environmental Sciences Jealott’s Hill International Research Centre Bracknell, Berkshire RG42 6EY UK -Test Ignored Hint CBI_ADDRESS +Test Ignored Hint CBI_ADDRESS \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 2ed23658..715d5c27 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -15,6 +15,21 @@ global Section section // section.expandByRegEx("recommendation_CBI_author", "(,? 
[A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1); // end +/* Fires when the section has a non-ignored NER entity of type CBI_author and hands those entities to the section as recommendation_CBI_author. */ +rule "0: Add CBI_author from ai" + when + Section(aiMatchesType("CBI_author")) + then + section.addAiEntities("CBI_author", "recommendation_CBI_author"); + end +/* Fires when the section has a non-ignored NER entity of type ORG: an ORG entity plus following STREET/POSTAL/COUNTRY/CARDINAL/CITY/STATE entities that start less than 100 characters after the running span end are combined into recommendation_CBI_address. NOTE(review): the rule name mentions CBI_author although the rule produces addresses - confirm whether it should be renamed. */ +rule "0: Combine ai types CBI_author from ai" + when + Section(aiMatchesType("ORG")) + then + section.combineAiTypes("ORG", "STREET,POSTAL,COUNTRY,CARDINAL,CITY,STATE", 100, "recommendation_CBI_address"); + end + rule "0: Expand CBI Authors with firstname initials" when Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/AiAddressCombine.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/AiAddressCombine.pdf new file mode 100644 index 00000000..055b530d Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/AiAddressCombine.pdf differ