RED-4835 - Entity positions are not calculated correctly for duplicate entities when one of them is marked as a false positive

This commit is contained in:
Timo Bejan 2022-08-08 13:30:18 +03:00
parent 16ea8364df
commit c41e230f85
7 changed files with 108 additions and 119 deletions

View File

@ -1,19 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@ -24,6 +19,7 @@ public class Entity implements ReasonHolder {
private String word;
private String type;
private boolean redaction;
private boolean falsePositive;
private String redactionReason;
private String legalBasis;
private List<EntityPositionSequence> positionSequences = new ArrayList<>();

View File

@ -1,23 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.ArgumentType;
import com.iqser.red.service.redaction.v1.model.Engine;
@ -28,10 +10,19 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Data
@Slf4j
@ -87,6 +78,8 @@ public class Section {
List<String> values = entitiesOfType.stream().map(Entity::getWord).collect(Collectors.toList());
Set<Entity> found = EntitySearchUtils.findEntities(searchText, new SearchImplementation(values, dictionary.isCaseInsensitiveDictionary(asType)), dictionary.getType(asType), new FindEntityDetails(asType, headline, sectionNumber, false, false, Engine.NER, EntityType.RECOMMENDATION));
EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
found = found.stream().filter(f -> !f.isFalsePositive()).collect(Collectors.toSet());
;
Set<Entity> finalResult = new HashSet<>();
@ -886,13 +879,11 @@ public class Section {
}
});
return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
return cleared.stream().filter(f -> !f.isFalsePositive()).collect(Collectors.toSet());
}
private void redact(String type, int ruleNumber, String reason, String legalBasis, boolean redaction) {
entities.forEach(entity -> {

View File

@ -1,18 +1,5 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval;
@ -22,24 +9,21 @@ import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.model.NerEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entities;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@Slf4j
@Service
@ -51,7 +35,6 @@ public class EntityRedactionService {
private final SurroundingWordsService surroundingWordsService;
public PageEntities findEntities(Dictionary dictionary, List<SectionText> sectionTexts, KieContainer kieContainer,
AnalyzeRequest analyzeRequest, NerEntities nerEntities) {
@ -262,7 +245,8 @@ public class EntityRedactionService {
nerFound.addAll(getNerValues(sectionNumber, nerEntities, cellStarts, headline));
}
return new Entities(EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions), nerFound);
var cleared = EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary, manualRedactions);
return new Entities(cleared.stream().filter(e -> !e.isFalsePositive()).collect(Collectors.toSet()), nerFound);
}

View File

@ -1,28 +1,15 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus;
import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@UtilityClass
@SuppressWarnings("PMD")
@ -39,12 +26,10 @@ public class EntitySearchUtils {
if (details.getEntityType() == EntityType.RECOMMENDATION) {
Set<Entity> falseRecommendations = find(inputString, type.getFalseRecommendationsSearch(), details.withEntityType(EntityType.FALSE_RECOMMENDATION));
removeFalsePositives(found, falseRecommendations);
// found.addAll(falseRecommendations);
markFalsePositives(found, falseRecommendations);
} else {
Set<Entity> falsePositives = find(inputString, type.getFalsePositiveSearch(), details.withEntityType(EntityType.FALSE_POSITIVE));
removeFalsePositives(found, falsePositives);
// found.addAll(falsePositives);
markFalsePositives(found, falsePositives);
}
return found;
@ -93,7 +78,7 @@ public class EntitySearchUtils {
}
}
if (manualRedactions != null && manualRedactions.getResizeRedactions() != null && !manualRedactions.getResizeRedactions().isEmpty()){
if (manualRedactions != null && manualRedactions.getResizeRedactions() != null && !manualRedactions.getResizeRedactions().isEmpty()) {
applyResizeRedactions(entities, manualRedactions);
}
@ -131,7 +116,11 @@ public class EntitySearchUtils {
}
public void removeFalsePositives(Set<Entity> entities, Set<Entity> falsePositives) {
public void markFalsePositives(Set<Entity> entities, Set<Entity> falsePositives) {
if (entities.size() == 2) {
log.info("asd");
}
List<Entity> wordsToRemove = new ArrayList<>();
for (Entity word : falsePositives) {
@ -142,8 +131,12 @@ public class EntitySearchUtils {
}
}
}
wordsToRemove.forEach(entities::remove);
entities.removeAll(falsePositives);
wordsToRemove.forEach(e -> e.setFalsePositive(true));
for (var entity : entities) {
if (falsePositives.contains(entity)) {
entity.setFalsePositive(true);
}
}
}
@ -165,9 +158,9 @@ public class EntitySearchUtils {
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
if (word.getEntityType().equals(EntityType.RECOMMENDATION) && inner.getEntityType().equals(EntityType.ENTITY)) {
wordsToRemove.add(word);
} else if(!(inner.getEntityType() == EntityType.FALSE_RECOMMENDATION && word.getEntityType() == EntityType.ENTITY ||
} else if (!(inner.getEntityType() == EntityType.FALSE_RECOMMENDATION && word.getEntityType() == EntityType.ENTITY ||
inner.getEntityType() == EntityType.ENTITY && word.getEntityType() == EntityType.FALSE_RECOMMENDATION)) {
if(inner.isResized()){
if (inner.isResized()) {
wordsToRemove.add(word);
} else {
wordsToRemove.add(inner);

View File

@ -5,9 +5,11 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.MessageType;
import com.iqser.red.service.redaction.v1.server.annotate.AnnotationService;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import java.time.OffsetDateTime;
@ -19,6 +21,9 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
public static final String FILE_NAME = "test-file";
@Autowired
private AnnotationService annotationService;
@Test
@SneakyThrows
public void testFile() {
@ -81,7 +86,19 @@ public class AnalyseFileRealDataIntegrationTest extends LiveDataIntegrationTest
redactionMessageReceiver.receiveAnalyzeRequest(in, false);
// var log = redactionStorageService.getRedactionLog("dossierId","fileId");
// var redactionLog = redactionStorageService.getRedactionLog("dossierId","fileId");
// redactionLog.getRedactionLogEntry().forEach(entry ->{
// log.info("Entry {}", entry);
// });
//
// var annotated = annotationService.annotate(AnnotateRequest.builder().fileId("fileId").dossierId("dossierId").build());
// annotated.getDocument();
//
// File tempFile = File.createTempFile("annotated",".pdf");
// IOUtils.write(annotated.getDocument(), new FileOutputStream(tempFile));
// log.warn("File saved to: {}",tempFile.getAbsolutePath());
// Runtime.getRuntime().exec("open "+tempFile.getAbsolutePath());
}
}

View File

@ -1,17 +1,21 @@
package com.iqser.red.service.redaction.v1.server.realdata;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyBoolean;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.ArgumentMatchers.nullable;
import static org.mockito.Mockito.when;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.client.*;
import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
@ -30,27 +34,14 @@ import org.springframework.core.io.Resource;
import org.springframework.core.io.support.ResourcePatternResolver;
import org.springframework.test.context.junit4.SpringRunner;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient;
import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import lombok.SneakyThrows;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import static org.mockito.ArgumentMatchers.*;
import static org.mockito.Mockito.when;
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ -143,6 +134,7 @@ public class LiveDataIntegrationTest {
if (found.isPresent()) {
var type = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst().get();
type.setEntries(getEntries(typeName, type.getTypeId()));
type.setFalsePositiveEntries(getFalsePositiveEntries(typeName, type.getTypeId()));
return type;
} else {
@ -196,7 +188,22 @@ public class LiveDataIntegrationTest {
Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**");
for (var resource : dictionaryResources) {
if (Objects.requireNonNull(resource.getFilename()).contains(typeName)) {
if (Objects.requireNonNull(resource.getFilename()).contains(typeName) && !resource.getFilename().contains("false_positive")) {
List<String> lines = IOUtils.readLines(resource.getInputStream());
return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList());
}
}
return new ArrayList<>();
}
@SneakyThrows
private List<DictionaryEntry> getFalsePositiveEntries(String typeName, String typeId) {
Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**");
for (var resource : dictionaryResources) {
if (Objects.requireNonNull(resource.getFilename()).contains(typeName) && resource.getFilename().contains("false_positive")) {
List<String> lines = IOUtils.readLines(resource.getInputStream());
return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList());