RED-4835 - entity position not calculated correctly for duplicates where 1 is marked as false positive
This commit is contained in:
parent
16ea8364df
commit
c41e230f85
@ -1,19 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import lombok.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@ -24,6 +19,7 @@ public class Entity implements ReasonHolder {
|
||||
private String word;
|
||||
private String type;
|
||||
private boolean redaction;
|
||||
private boolean falsePositive;
|
||||
private String redactionReason;
|
||||
private String legalBasis;
|
||||
private List<EntityPositionSequence> positionSequences = new ArrayList<>();
|
||||
|
||||
@ -1,23 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.ArgumentType;
|
||||
import com.iqser.red.service.redaction.v1.model.Engine;
|
||||
@ -28,10 +10,19 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Data
|
||||
@Slf4j
|
||||
@ -87,6 +78,8 @@ public class Section {
|
||||
List<String> values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toList());
|
||||
Set<Entity> found = EntitySearchUtils.findEntities(searchText, new SearchImplementation(values, dictionary.isCaseInsensitiveDictionary(asType)), dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION));
|
||||
EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
|
||||
found = found.stream().filter(f -> !f.isFalsePositive()).collect(Collectors.toSet());
|
||||
;
|
||||
|
||||
Set<Entity> finalResult = new HashSet<>();
|
||||
|
||||
@ -886,13 +879,11 @@ public class Section {
|
||||
}
|
||||
});
|
||||
|
||||
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
|
||||
var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
|
||||
return cleared.stream().filter(f -> !f.isFalsePositive()).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
private void redact(String type, int ruleNumber, String reason, String legalBasis, boolean redaction) {
|
||||
|
||||
entities.forEach(entity -> {
|
||||
|
||||
@ -1,18 +1,5 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval;
|
||||
@ -22,24 +9,21 @@ import com.iqser.red.service.redaction.v1.model.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -51,7 +35,6 @@ public class EntityRedactionService {
|
||||
private final SurroundingWordsService surroundingWordsService;
|
||||
|
||||
|
||||
|
||||
public PageEntities findEntities(Dictionary dictionary, List<SectionText> sectionTexts, KieContainer kieContainer,
|
||||
AnalyzeRequest analyzeRequest, NerEntities nerEntities) {
|
||||
|
||||
@ -262,7 +245,8 @@ public class EntityRedactionService {
|
||||
nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline));
|
||||
}
|
||||
|
||||
return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions), nerFound);
|
||||
var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
|
||||
return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,28 +1,15 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
@SuppressWarnings("PMD")
|
||||
@ -39,12 +26,10 @@ public class EntitySearchUtils {
|
||||
|
||||
if (details.getEntityType() == EntityType.RECOMMENDATION) {
|
||||
Set<Entity> falseRecommendations = find(inputString, type.getFalseRecommendationsSearch(), details.withEntityType(EntityType.FALSE_RECOMMENDATION));
|
||||
removeFalsePositives(found, falseRecommendations);
|
||||
// found.addAll(falseRecommendations);
|
||||
markFalsePositives(found, falseRecommendations);
|
||||
} else {
|
||||
Set<Entity> falsePositives = find(inputString, type.getFalsePositiveSearch(), details.withEntityType(EntityType.FALSE_POSITIVE));
|
||||
removeFalsePositives(found, falsePositives);
|
||||
// found.addAll(falsePositives);
|
||||
markFalsePositives(found, falsePositives);
|
||||
}
|
||||
|
||||
return found;
|
||||
@ -93,7 +78,7 @@ public class EntitySearchUtils {
|
||||
}
|
||||
}
|
||||
|
||||
if (manualRedactions != null && manualRedactions.getResizeRedactions() != null && !manualRedactions.getResizeRedactions().isEmpty()){
|
||||
if (manualRedactions != null && manualRedactions.getResizeRedactions() != null && !manualRedactions.getResizeRedactions().isEmpty()) {
|
||||
applyResizeRedactions(entities, manualRedactions);
|
||||
}
|
||||
|
||||
@ -131,7 +116,11 @@ public class EntitySearchUtils {
|
||||
}
|
||||
|
||||
|
||||
public void removeFalsePositives(Set<Entity> entities, Set<Entity> falsePositives) {
|
||||
public void markFalsePositives(Set<Entity> entities, Set<Entity> falsePositives) {
|
||||
|
||||
if (entities.size() == 2) {
|
||||
log.info("asd");
|
||||
}
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : falsePositives) {
|
||||
@ -142,8 +131,12 @@ public class EntitySearchUtils {
|
||||
}
|
||||
}
|
||||
}
|
||||
wordsToRemove.forEach(entities::remove);
|
||||
entities.removeAll(falsePositives);
|
||||
wordsToRemove.forEach(e -> e.setFalsePositive(true));
|
||||
for (var entity : entities) {
|
||||
if (falsePositives.contains(entity)) {
|
||||
entity.setFalsePositive(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -165,9 +158,9 @@ public class EntitySearchUtils {
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
|
||||
if (word.getEntityType().equals(EntityType.RECOMMENDATION) && inner.getEntityType().equals(EntityType.ENTITY)) {
|
||||
wordsToRemove.add(word);
|
||||
} else if(!(inner.getEntityType() == EntityType.FALSE_RECOMMENDATION && word.getEntityType() == EntityType.ENTITY ||
|
||||
} else if (!(inner.getEntityType() == EntityType.FALSE_RECOMMENDATION && word.getEntityType() == EntityType.ENTITY ||
|
||||
inner.getEntityType() == EntityType.ENTITY && word.getEntityType() == EntityType.FALSE_RECOMMENDATION)) {
|
||||
if(inner.isResized()){
|
||||
if (inner.isResized()) {
|
||||
wordsToRemove.add(word);
|
||||
} else {
|
||||
wordsToRemove.add(inner);
|
||||
|
||||
@ -5,9 +5,11 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.MessageType;
|
||||
import com.iqser.red.service.redaction.v1.server.annotate.AnnotationService;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.junit.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import java.time.OffsetDateTime;
|
||||
@ -19,6 +21,9 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
|
||||
public static final String FILE_NAME = "test-file";
|
||||
|
||||
@Autowired
|
||||
private AnnotationService annotationService;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testFile() {
|
||||
@ -81,7 +86,19 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
|
||||
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
|
||||
|
||||
|
||||
// var log = redactionStorageService.getRedactionLog("dossierId","fileId");
|
||||
// var redactionLog = redactionStorageService.getRedactionLog("dossierId","fileId");
|
||||
// redactionLog.getRedactionLogEntry().forEach(entry ->{
|
||||
// log.info("Entry {}", entry);
|
||||
// });
|
||||
//
|
||||
// var annotated = annotationService.annotate(AnnotateRequest.builder().fileId("fileId").dossierId("dossierId").build());
|
||||
// annotated.getDocument();
|
||||
//
|
||||
// File tempFile = File.createTempFile("annotated",".pdf");
|
||||
// IOUtils.write(annotated.getDocument(), new FileOutputStream(tempFile));
|
||||
// log.warn("File saved to: {}",tempFile.getAbsolutePath());
|
||||
// Runtime.getRuntime().exec("open "+tempFile.getAbsolutePath());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,17 +1,21 @@
|
||||
package com.iqser.red.service.redaction.v1.server.realdata;
|
||||
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.anyBoolean;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
import static org.mockito.ArgumentMatchers.nullable;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.fasterxml.jackson.core.type.TypeReference;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type;
|
||||
import com.iqser.red.service.redaction.v1.server.Application;
|
||||
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.client.*;
|
||||
import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
@ -30,27 +34,14 @@ import org.springframework.core.io.Resource;
|
||||
import org.springframework.core.io.support.ResourcePatternResolver;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.fasterxml.jackson.core.type.TypeReference;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type;
|
||||
import com.iqser.red.service.redaction.v1.server.Application;
|
||||
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
import static org.mockito.ArgumentMatchers.*;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@ -143,6 +134,7 @@ public class LiveDataIntegrationTest {
|
||||
if (found.isPresent()) {
|
||||
var type = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst().get();
|
||||
type.setEntries(getEntries(typeName, type.getTypeId()));
|
||||
type.setFalsePositiveEntries(getFalsePositiveEntries(typeName, type.getTypeId()));
|
||||
|
||||
return type;
|
||||
} else {
|
||||
@ -196,7 +188,22 @@ public class LiveDataIntegrationTest {
|
||||
|
||||
Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**");
|
||||
for (var resource : dictionaryResources) {
|
||||
if (Objects.requireNonNull(resource.getFilename()).contains(typeName)) {
|
||||
if (Objects.requireNonNull(resource.getFilename()).contains(typeName) && !resource.getFilename().contains("false_positive")) {
|
||||
|
||||
List<String> lines = IOUtils.readLines(resource.getInputStream());
|
||||
return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList());
|
||||
|
||||
}
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private List<DictionaryEntry> getFalsePositiveEntries(String typeName, String typeId) {
|
||||
|
||||
Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**");
|
||||
for (var resource : dictionaryResources) {
|
||||
if (Objects.requireNonNull(resource.getFilename()).contains(typeName) && resource.getFilename().contains("false_positive")) {
|
||||
|
||||
List<String> lines = IOUtils.readLines(resource.getInputStream());
|
||||
return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList());
|
||||
|
||||
@ -0,0 +1 @@
|
||||
Salt), CGA376944 (S-enantiomer
|
||||
Loading…
x
Reference in New Issue
Block a user