Pull request #88: Fixed false positive dictionary problems
Merge in RED/redaction-service from FalsePositiveFix to master * commit '609018a051a41504328caf3bd44d27af5848959d': Fixed false positive dictionary problems
This commit is contained in:
commit
22f609a93a
@ -2,10 +2,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
@ -14,6 +12,7 @@ import java.util.regex.Pattern;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -45,6 +44,10 @@ public class Section {
|
||||
|
||||
private Map<String, CellValue> tabularData;
|
||||
|
||||
private Dictionary dictionary;
|
||||
|
||||
private SearchableText searchableText;
|
||||
|
||||
|
||||
public boolean rowEquals(String headerName, String value) {
|
||||
|
||||
@ -243,7 +246,7 @@ public class Section {
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
|
||||
return removeEntitiesContainedInLarger(found);
|
||||
return PositionUtil.clearAndFindPositions(found, searchableText, dictionary);
|
||||
}
|
||||
|
||||
|
||||
@ -253,22 +256,6 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
return entities;
|
||||
}
|
||||
|
||||
|
||||
public void highlightCell(String cellHeader, int ruleNumber, String type) {
|
||||
|
||||
annotateCell(cellHeader, ruleNumber, type, false, false, null, null);
|
||||
@ -309,11 +296,15 @@ public class Section {
|
||||
.getSequences()); // Make sure no other cells with same content are highlighted
|
||||
entity.setLegalBasis(legalBasis);
|
||||
|
||||
Set<Entity> singleEntitySet = new HashSet<>();
|
||||
singleEntitySet.add(entity);
|
||||
PositionUtil.clearAndFindPositions(singleEntitySet, searchableText, dictionary);
|
||||
|
||||
// HashSet keeps the older value, but we want the new only.
|
||||
entities.remove(entity);
|
||||
entities.add(entity);
|
||||
|
||||
entities = removeEntitiesContainedInLarger(entities);
|
||||
PositionUtil.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
if (addAsRecommendations && !isLocal()) {
|
||||
String cleanedWord = word.replaceAll(",", " ").replaceAll(" ", " ").trim() + " ";
|
||||
|
||||
@ -28,6 +28,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionS
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
@ -69,7 +70,7 @@ public class EntityRedactionService {
|
||||
documentEntities.removeAll(foundByLocal);
|
||||
documentEntities.addAll(foundByLocal);
|
||||
|
||||
removeEntitiesContainedInLarger(documentEntities);
|
||||
PositionUtil.removeEntitiesContainedInLarger(documentEntities);
|
||||
}
|
||||
|
||||
for (Entity entity : documentEntities) {
|
||||
@ -136,8 +137,7 @@ public class EntityRedactionService {
|
||||
cellStarts.add(cellStart);
|
||||
start = start + cell.toString().trim().length() + 1;
|
||||
}
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary
|
||||
.getDictionaryModels(), local);
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
@ -151,6 +151,8 @@ public class EntityRedactionService {
|
||||
.headline(table.getHeadline())
|
||||
.sectionNumber(sectionNumber)
|
||||
.tabularData(tabularData)
|
||||
.searchableText(searchableRow)
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableRow));
|
||||
|
||||
sectionNumber++;
|
||||
@ -159,7 +161,7 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local);
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
@ -172,6 +174,8 @@ public class EntityRedactionService {
|
||||
.searchText(searchableText.toString())
|
||||
.headline(paragraph.getHeadline())
|
||||
.sectionNumber(sectionNumber)
|
||||
.searchableText(searchableText)
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableText));
|
||||
|
||||
sectionNumber++;
|
||||
@ -179,7 +183,7 @@ public class EntityRedactionService {
|
||||
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(sectionSearchableTextPair.getSection());
|
||||
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), sectionSearchableTextPair.getSearchableText(), dictionary));
|
||||
documentEntities.addAll(analysedRowSection.getEntities());
|
||||
|
||||
analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> {
|
||||
if (dictionary.isRecommendation(key)) {
|
||||
@ -210,23 +214,8 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
|
||||
|
||||
removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Entity entity : entities) {
|
||||
if (entity.getPositionSequences().isEmpty()) {
|
||||
entity.setPositionSequences(text.getSequences(entity.getWord(), dictionary.isCaseInsensitiveDictionary(entity
|
||||
.getType()), entity.getTargetSequences()));
|
||||
}
|
||||
}
|
||||
|
||||
return entities;
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
|
||||
List<DictionaryModel> dictionary, boolean local) {
|
||||
Dictionary dictionary, boolean local) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
String searchableString = searchableText.toString();
|
||||
@ -235,15 +224,15 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
String lowercaseInputString = searchableString.toLowerCase();
|
||||
for (DictionaryModel model : dictionary) {
|
||||
for (DictionaryModel model : dictionary.getDictionaryModels()) {
|
||||
if (model.isCaseInsensitive()) {
|
||||
found.addAll(find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local));
|
||||
} else {
|
||||
found.addAll(find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local));
|
||||
}
|
||||
}
|
||||
removeEntitiesContainedInLarger(found);
|
||||
return found;
|
||||
|
||||
return PositionUtil.clearAndFindPositions(found, searchableText, dictionary);
|
||||
|
||||
}
|
||||
|
||||
@ -281,22 +270,6 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
|
||||
.getSectionNumber() == inner.getSectionNumber()) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
}
|
||||
|
||||
|
||||
private void addSectionToManualRedactions(List<TextBlock> textBlocks, ManualRedactions manualRedactions,
|
||||
String section, int sectionNumber) {
|
||||
|
||||
|
||||
@ -0,0 +1,72 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class PositionUtil {
|
||||
|
||||
public Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
|
||||
|
||||
Map<String, List<Entity>> entitiesByWord = new HashMap<>();
|
||||
|
||||
for (Entity entity : entities) {
|
||||
entitiesByWord.computeIfAbsent(entity.getWord(), (x) -> new ArrayList<>()).add(entity);
|
||||
}
|
||||
|
||||
for (String word : entitiesByWord.keySet()) {
|
||||
|
||||
List<Entity> orderedEntities = entitiesByWord.get(word)
|
||||
.stream()
|
||||
.sorted(Comparator.comparing(Entity::getStart))
|
||||
.collect(Collectors.toList());
|
||||
Entity firstEntity = orderedEntities.get(0);
|
||||
List<EntityPositionSequence> positionSequences = text.getSequences(firstEntity.getWord(), dictionary.isCaseInsensitiveDictionary(firstEntity
|
||||
.getType()), firstEntity.getTargetSequences());
|
||||
|
||||
for (int i = 0; i <= orderedEntities.size() - 1; i++) {
|
||||
try {
|
||||
orderedEntities.get(i).setPositionSequences(List.of(positionSequences.get(i)));
|
||||
} catch (Exception e){
|
||||
log.warn("Mismatch between EntityPositionSequence and found Entity!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
removeEntitiesContainedInLarger(entities);
|
||||
|
||||
return entities;
|
||||
}
|
||||
|
||||
|
||||
public void removeEntitiesContainedInLarger(Set<Entity> entities) {
|
||||
|
||||
List<Entity> wordsToRemove = new ArrayList<>();
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
|
||||
.getSectionNumber() == inner.getSectionNumber()) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
}
|
||||
entities.removeAll(wordsToRemove);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -48,6 +48,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
|
||||
@ -108,7 +109,7 @@ public class EntityRedactionServiceTest {
|
||||
Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false);
|
||||
entities.add(nested);
|
||||
entities.add(nesting);
|
||||
entityRedactionService.removeEntitiesContainedInLarger(entities);
|
||||
PositionUtil.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
assertThat(entities.size()).isEqualTo(1);
|
||||
assertThat(entities).contains(nesting);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user