Pull request #88: Fixed false positive dictionary problems

Merge in RED/redaction-service from FalsePositiveFix to master

* commit '609018a051a41504328caf3bd44d27af5848959d':
  Fixed false positive dictionary problems
This commit is contained in:
Dominique Eiflaender 2021-01-04 16:51:10 +01:00
commit 22f609a93a
4 changed files with 98 additions and 61 deletions

View File

@ -2,10 +2,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
@ -14,6 +12,7 @@ import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil;
import lombok.Builder;
import lombok.Data;
@ -45,6 +44,10 @@ public class Section {
private Map<String, CellValue> tabularData;
private Dictionary dictionary;
private SearchableText searchableText;
public boolean rowEquals(String headerName, String value) {
@ -243,7 +246,7 @@ public class Section {
}
} while (startIndex > -1);
return removeEntitiesContainedInLarger(found);
return PositionUtil.clearAndFindPositions(found, searchableText, dictionary);
}
@ -253,22 +256,6 @@ public class Section {
}
/**
 * Drops every entity whose word is strictly shorter than, and whose
 * [start, end] range lies inside, the range of another entity in the same set.
 *
 * @param entities the set to prune; it is modified in place
 * @return the same set instance that was passed in, after pruning
 */
public Set<Entity> removeEntitiesContainedInLarger(Set<Entity> entities) {
    List<Entity> nestedEntities = new ArrayList<>();
    for (Entity outer : entities) {
        int outerLength = outer.getWord().length();
        for (Entity candidate : entities) {
            boolean shorter = candidate.getWord().length() < outerLength;
            boolean insideOuter = candidate.getStart() >= outer.getStart()
                    && candidate.getEnd() <= outer.getEnd();
            // Identity check: an entity never counts as nested in itself.
            if (shorter && insideOuter && outer != candidate) {
                nestedEntities.add(candidate);
            }
        }
    }
    // Removal happens after the scan so the set is not modified while iterating.
    entities.removeAll(nestedEntities);
    return entities;
}
public void highlightCell(String cellHeader, int ruleNumber, String type) {
annotateCell(cellHeader, ruleNumber, type, false, false, null, null);
@ -309,11 +296,15 @@ public class Section {
.getSequences()); // Make sure no other cells with same content are highlighted
entity.setLegalBasis(legalBasis);
Set<Entity> singleEntitySet = new HashSet<>();
singleEntitySet.add(entity);
PositionUtil.clearAndFindPositions(singleEntitySet, searchableText, dictionary);
// HashSet keeps the older value, but we want the new only.
entities.remove(entity);
entities.add(entity);
entities = removeEntitiesContainedInLarger(entities);
PositionUtil.removeEntitiesContainedInLarger(entities);
if (addAsRecommendations && !isLocal()) {
String cleanedWord = word.replaceAll(",", " ").replaceAll(" ", " ").trim() + " ";

View File

@ -28,6 +28,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionS
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@ -69,7 +70,7 @@ public class EntityRedactionService {
documentEntities.removeAll(foundByLocal);
documentEntities.addAll(foundByLocal);
removeEntitiesContainedInLarger(documentEntities);
PositionUtil.removeEntitiesContainedInLarger(documentEntities);
}
for (Entity entity : documentEntities) {
@ -136,8 +137,7 @@ public class EntityRedactionService {
cellStarts.add(cellStart);
start = start + cell.toString().trim().length() + 1;
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary
.getDictionaryModels(), local);
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary, local);
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
@ -151,6 +151,8 @@ public class EntityRedactionService {
.headline(table.getHeadline())
.sectionNumber(sectionNumber)
.tabularData(tabularData)
.searchableText(searchableRow)
.dictionary(dictionary)
.build(), searchableRow));
sectionNumber++;
@ -159,7 +161,7 @@ public class EntityRedactionService {
}
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary, local);
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
@ -172,6 +174,8 @@ public class EntityRedactionService {
.searchText(searchableText.toString())
.headline(paragraph.getHeadline())
.sectionNumber(sectionNumber)
.searchableText(searchableText)
.dictionary(dictionary)
.build(), searchableText));
sectionNumber++;
@ -179,7 +183,7 @@ public class EntityRedactionService {
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(sectionSearchableTextPair.getSection());
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), sectionSearchableTextPair.getSearchableText(), dictionary));
documentEntities.addAll(analysedRowSection.getEntities());
analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> {
if (dictionary.isRecommendation(key)) {
@ -210,23 +214,8 @@ public class EntityRedactionService {
}
/**
 * Prunes nested entities, then looks up position sequences for every
 * remaining entity that does not have any yet.
 *
 * @param entities   the entities to process; the set and its members are modified in place
 * @param text       the searchable text the position sequences are read from
 * @param dictionary used to decide whether an entity's type is matched case-insensitively
 * @return the same set instance that was passed in
 */
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
    removeEntitiesContainedInLarger(entities);
    entities.forEach(candidate -> {
        // Entities that already carry positions are left untouched.
        if (!candidate.getPositionSequences().isEmpty()) {
            return;
        }
        boolean caseInsensitive = dictionary.isCaseInsensitiveDictionary(candidate.getType());
        candidate.setPositionSequences(
                text.getSequences(candidate.getWord(), caseInsensitive, candidate.getTargetSequences()));
    });
    return entities;
}
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
List<DictionaryModel> dictionary, boolean local) {
Dictionary dictionary, boolean local) {
Set<Entity> found = new HashSet<>();
String searchableString = searchableText.toString();
@ -235,15 +224,15 @@ public class EntityRedactionService {
}
String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionary) {
for (DictionaryModel model : dictionary.getDictionaryModels()) {
if (model.isCaseInsensitive()) {
found.addAll(find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local));
} else {
found.addAll(find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local));
}
}
removeEntitiesContainedInLarger(found);
return found;
return PositionUtil.clearAndFindPositions(found, searchableText, dictionary);
}
@ -281,22 +270,6 @@ public class EntityRedactionService {
}
/**
 * Removes, in place, every entity that is nested inside a larger entity of
 * the same section: its word is strictly shorter and its [start, end] range
 * lies within the larger entity's range.
 *
 * @param entities the set to prune; it is modified in place
 */
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
    List<Entity> nestedEntities = new ArrayList<>();
    for (Entity outer : entities) {
        int outerLength = outer.getWord().length();
        for (Entity candidate : entities) {
            boolean shorter = candidate.getWord().length() < outerLength;
            boolean insideOuter = candidate.getStart() >= outer.getStart()
                    && candidate.getEnd() <= outer.getEnd();
            boolean sameSection = outer.getSectionNumber() == candidate.getSectionNumber();
            // Identity check: an entity never counts as nested in itself.
            if (shorter && insideOuter && outer != candidate && sameSection) {
                nestedEntities.add(candidate);
            }
        }
    }
    // Removal happens after the scan so the set is not modified while iterating.
    entities.removeAll(nestedEntities);
}
private void addSectionToManualRedactions(List<TextBlock> textBlocks, ManualRedactions manualRedactions,
String section, int sectionNumber) {

View File

@ -0,0 +1,72 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
/**
 * Static helpers that attach position sequences to found entities and that
 * prune entities nested inside larger ones.
 */
@Slf4j
@UtilityClass
public class PositionUtil {

    /**
     * Assigns one position sequence to each entity and then prunes nested entities.
     *
     * <p>Entities are grouped by word. Within a group they are ordered by start
     * offset, the sequences for that word are looked up once, and the i-th entity
     * receives the i-th sequence. Any position sequences an entity already had
     * are overwritten.
     *
     * <p>NOTE(review): pairing by index assumes {@code text.getSequences} returns
     * occurrences in the same order as ascending start offsets - confirm against
     * {@code SearchableText}.
     *
     * <p>An entity for which no sequence is available keeps its current position
     * sequences and a warning is logged; processing continues with the rest.
     *
     * @param entities   the entities to process; the set and its members are modified in place
     * @param text       the searchable text the position sequences are read from
     * @param dictionary used to decide whether an entity's type is matched case-insensitively
     * @return the same set instance that was passed in, after pruning
     */
    public Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {

        Map<String, List<Entity>> entitiesByWord = new HashMap<>();
        for (Entity entity : entities) {
            entitiesByWord.computeIfAbsent(entity.getWord(), word -> new ArrayList<>()).add(entity);
        }

        for (List<Entity> sameWordEntities : entitiesByWord.values()) {
            List<Entity> orderedEntities = sameWordEntities.stream()
                    .sorted(Comparator.comparing(Entity::getStart))
                    .collect(Collectors.toList());

            // The lookup is driven by the first occurrence's word, type and target sequences.
            Entity firstEntity = orderedEntities.get(0);
            boolean caseInsensitive = dictionary.isCaseInsensitiveDictionary(firstEntity.getType());
            List<EntityPositionSequence> positionSequences = text.getSequences(firstEntity.getWord(),
                    caseInsensitive, firstEntity.getTargetSequences());

            int available = positionSequences == null ? 0 : positionSequences.size();
            for (int i = 0; i < orderedEntities.size(); i++) {
                // Explicit bounds/null check instead of catching Exception: a shortfall of
                // sequences is an expected situation, and unrelated failures must not be hidden.
                EntityPositionSequence sequence = i < available ? positionSequences.get(i) : null;
                if (sequence == null) {
                    // The word itself is deliberately not logged; in a redaction service it may
                    // be exactly the text that is supposed to be hidden.
                    log.warn("Mismatch between EntityPositionSequence and found Entity! "
                                    + "Occurrence {} of {} has no position sequence ({} sequences found).",
                            i + 1, orderedEntities.size(), available);
                    continue;
                }
                orderedEntities.get(i).setPositionSequences(List.of(sequence));
            }
        }

        removeEntitiesContainedInLarger(entities);
        return entities;
    }


    /**
     * Removes, in place, every entity that is nested inside a larger entity of
     * the same section.
     *
     * @param entities the set to prune; it is modified in place
     */
    public void removeEntitiesContainedInLarger(Set<Entity> entities) {

        List<Entity> wordsToRemove = new ArrayList<>();
        for (Entity outer : entities) {
            for (Entity inner : entities) {
                if (isContainedInLarger(inner, outer)) {
                    wordsToRemove.add(inner);
                }
            }
        }
        // Removal happens after the scan so the set is not modified while iterating.
        entities.removeAll(wordsToRemove);
    }


    /**
     * Tells whether {@code inner} is a different entity than {@code outer}, has a
     * strictly shorter word, lies within {@code outer}'s [start, end] range and
     * belongs to the same section.
     */
    private boolean isContainedInLarger(Entity inner, Entity outer) {

        return inner.getWord().length() < outer.getWord().length()
                && inner.getStart() >= outer.getStart()
                && inner.getEnd() <= outer.getEnd()
                && outer != inner
                && outer.getSectionNumber() == inner.getSectionNumber();
    }

}

View File

@ -48,6 +48,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
@ -108,7 +109,7 @@ public class EntityRedactionServiceTest {
Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false);
entities.add(nested);
entities.add(nesting);
entityRedactionService.removeEntitiesContainedInLarger(entities);
PositionUtil.removeEntitiesContainedInLarger(entities);
assertThat(entities.size()).isEqualTo(1);
assertThat(entities).contains(nesting);