diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java index 78a6c6f3..ee16afd8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java @@ -1,19 +1,14 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import com.iqser.red.service.redaction.v1.model.Engine; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import lombok.*; + import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; -import com.iqser.red.service.redaction.v1.model.Engine; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; - -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; - @Data @Builder @NoArgsConstructor @@ -24,6 +19,7 @@ public class Entity implements ReasonHolder { private String word; private String type; private boolean redaction; + private boolean falsePositive; private String redactionReason; private String legalBasis; private List positionSequences = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index b74d65e9..4dbb40f7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -1,23 +1,5 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; import com.iqser.red.service.redaction.v1.model.ArgumentType; import com.iqser.red.service.redaction.v1.model.Engine; @@ -28,10 +10,19 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails; import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation; - import lombok.Builder; import lombok.Data; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; @Data @Slf4j @@ -87,6 +78,8 @@ public class Section { List values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toList()); Set found = EntitySearchUtils.findEntities(searchText, new SearchImplementation(values, dictionary.isCaseInsensitiveDictionary(asType)), dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION)); EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions); + found = found.stream().filter(f -> !f.isFalsePositive()).collect(Collectors.toSet()); + ; Set finalResult = new HashSet<>(); @@ -886,13 +879,11 @@ public class Section { } }); - return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions); + var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions); + return cleared.stream().filter(f -> !f.isFalsePositive()).collect(Collectors.toSet()); } - - - private void redact(String type, int ruleNumber, String reason, String legalBasis, boolean redaction) { entities.forEach(entity -> { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 33b41bd6..c9d909e2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -1,18 +1,5 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import org.apache.commons.lang3.StringUtils; -import org.kie.api.runtime.KieContainer; -import org.springframework.stereotype.Service; - import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval; @@ -22,24 +9,21 @@ import com.iqser.red.service.redaction.v1.model.Engine; import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; import com.iqser.red.service.redaction.v1.server.client.model.NerEntities; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entities; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; -import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; -import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; -import com.iqser.red.service.redaction.v1.server.redaction.model.Section; -import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; +import com.iqser.red.service.redaction.v1.server.redaction.model.*; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails; import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; - import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.kie.api.runtime.KieContainer; +import org.springframework.stereotype.Service; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; @Slf4j @Service @@ -51,7 +35,6 @@ public class EntityRedactionService { private final SurroundingWordsService surroundingWordsService; - public PageEntities findEntities(Dictionary dictionary, List sectionTexts, KieContainer kieContainer, AnalyzeRequest analyzeRequest, NerEntities nerEntities) { @@ -262,7 +245,8 @@ public class EntityRedactionService { nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline)); } - return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions), nerFound); + var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions); + return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index aff9a76a..d2169b85 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -1,28 +1,15 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; -import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - +import com.iqser.red.service.redaction.v1.server.redaction.model.*; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; +import java.util.*; +import java.util.stream.Collectors; + @Slf4j @UtilityClass @SuppressWarnings("PMD") @@ -39,12 +26,10 @@ public class EntitySearchUtils { if (details.getEntityType() == EntityType.RECOMMENDATION) { Set falseRecommendations = find(inputString, type.getFalseRecommendationsSearch(), details.withEntityType(EntityType.FALSE_RECOMMENDATION)); - removeFalsePositives(found, falseRecommendations); -// found.addAll(falseRecommendations); + markFalsePositives(found, falseRecommendations); } else { Set falsePositives = find(inputString, type.getFalsePositiveSearch(), details.withEntityType(EntityType.FALSE_POSITIVE)); - removeFalsePositives(found, falsePositives); -// found.addAll(falsePositives); + markFalsePositives(found, falsePositives); } return found; @@ -93,7 +78,7 @@ public class EntitySearchUtils { } } - if (manualRedactions != null && manualRedactions.getResizeRedactions() != null && !manualRedactions.getResizeRedactions().isEmpty()){ + if (manualRedactions != null && manualRedactions.getResizeRedactions() != null && !manualRedactions.getResizeRedactions().isEmpty()) { applyResizeRedactions(entities, manualRedactions); } @@ -131,7 +116,11 @@ public class EntitySearchUtils { } - public void removeFalsePositives(Set entities, Set falsePositives) { + public void markFalsePositives(Set entities, Set falsePositives) { + + if (entities.size() == 2) { + log.info("asd"); + } List wordsToRemove = new ArrayList<>(); for (Entity word : falsePositives) { @@ -142,8 +131,12 @@ public class EntitySearchUtils { } } } - wordsToRemove.forEach(entities::remove); - entities.removeAll(falsePositives); + wordsToRemove.forEach(e -> e.setFalsePositive(true)); + for (var entity : entities) { + if (falsePositives.contains(entity)) { + entity.setFalsePositive(true); + } + } } @@ -165,9 +158,9 @@ public class EntitySearchUtils { .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) { if (word.getEntityType().equals(EntityType.RECOMMENDATION) && inner.getEntityType().equals(EntityType.ENTITY)) { wordsToRemove.add(word); - } else if(!(inner.getEntityType() == EntityType.FALSE_RECOMMENDATION && word.getEntityType() == EntityType.ENTITY || + } else if (!(inner.getEntityType() == EntityType.FALSE_RECOMMENDATION && word.getEntityType() == EntityType.ENTITY || inner.getEntityType() == EntityType.ENTITY && word.getEntityType() == EntityType.FALSE_RECOMMENDATION)) { - if(inner.isResized()){ + if (inner.isResized()) { wordsToRemove.add(word); } else { wordsToRemove.add(inner); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java index 42101901..c1e0c469 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/AnalyseFileRealDataIntegrationTest.java @@ -5,9 +5,11 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; import com.iqser.red.service.redaction.v1.model.MessageType; +import com.iqser.red.service.redaction.v1.server.annotate.AnnotationService; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.junit.Test; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; import java.time.OffsetDateTime; @@ -19,6 +21,9 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest public static final String FILE_NAME = "test-file"; + @Autowired + private AnnotationService annotationService; + @Test @SneakyThrows public void testFile() { @@ -81,7 +86,19 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest redactionMessageReceiver.receiveAnalyzeRequest(in, false); - // var log = redactionStorageService.getRedactionLog("dossierId","fileId"); +// var redactionLog = redactionStorageService.getRedactionLog("dossierId","fileId"); +// redactionLog.getRedactionLogEntry().forEach(entry ->{ +// log.info("Entry {}", entry); +// }); +// +// var annotated = annotationService.annotate(AnnotateRequest.builder().fileId("fileId").dossierId("dossierId").build()); +// annotated.getDocument(); +// +// File tempFile = File.createTempFile("annotated",".pdf"); +// IOUtils.write(annotated.getDocument(), new FileOutputStream(tempFile)); +// log.warn("File saved to: {}",tempFile.getAbsolutePath()); +// Runtime.getRuntime().exec("open "+tempFile.getAbsolutePath()); } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java index c7ea9305..6f76a41a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java @@ -1,17 +1,21 @@ package com.iqser.red.service.redaction.v1.server.realdata; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyBoolean; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.ArgumentMatchers.nullable; -import static org.mockito.Mockito.when; - -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; - +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; +import com.iqser.red.service.redaction.v1.server.Application; +import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; +import com.iqser.red.service.redaction.v1.server.client.*; +import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver; +import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; +import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.storage.commons.StorageAutoConfiguration; +import com.iqser.red.storage.commons.service.StorageService; +import lombok.SneakyThrows; import org.apache.commons.io.IOUtils; import org.junit.Before; import org.junit.Test; @@ -30,27 +34,14 @@ import org.springframework.core.io.Resource; import org.springframework.core.io.support.ResourcePatternResolver; import org.springframework.test.context.junit4.SpringRunner; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; -import com.iqser.red.service.redaction.v1.server.Application; -import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; -import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; -import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient; -import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient; -import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; -import com.iqser.red.service.redaction.v1.server.client.RulesClient; -import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver; -import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; -import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; -import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; -import com.iqser.red.storage.commons.StorageAutoConfiguration; -import com.iqser.red.storage.commons.service.StorageService; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; -import lombok.SneakyThrows; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import static org.mockito.ArgumentMatchers.*; +import static org.mockito.Mockito.when; @RunWith(SpringRunner.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @@ -143,6 +134,7 @@ public class LiveDataIntegrationTest { if (found.isPresent()) { var type = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst().get(); type.setEntries(getEntries(typeName, type.getTypeId())); + type.setFalsePositiveEntries(getFalsePositiveEntries(typeName, type.getTypeId())); return type; } else { @@ -196,7 +188,22 @@ public class LiveDataIntegrationTest { Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**"); for (var resource : dictionaryResources) { - if (Objects.requireNonNull(resource.getFilename()).contains(typeName)) { + if (Objects.requireNonNull(resource.getFilename()).contains(typeName) && !resource.getFilename().contains("false_positive")) { + + List lines = IOUtils.readLines(resource.getInputStream()); + return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList()); + + } + } + return new ArrayList<>(); + } + + @SneakyThrows + private List getFalsePositiveEntries(String typeName, String typeId) { + + Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**"); + for (var resource : dictionaryResources) { + if (Objects.requireNonNull(resource.getFilename()).contains(typeName) && resource.getFilename().contains("false_positive")) { List lines = IOUtils.readLines(resource.getInputStream()); return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/performance/dictionaries/EFSA_sanitisation_GFL_v1/dictionaries/CBI_author_false_positive.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/performance/dictionaries/EFSA_sanitisation_GFL_v1/dictionaries/CBI_author_false_positive.txt new file mode 100644 index 00000000..82e4dac5 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/performance/dictionaries/EFSA_sanitisation_GFL_v1/dictionaries/CBI_author_false_positive.txt @@ -0,0 +1 @@ +Salt), CGA376944 (S-enantiomer