From 609018a051a41504328caf3bd44d27af5848959d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Mon, 4 Jan 2021 15:32:56 +0100 Subject: [PATCH] Fixed false positive dictionary problems --- .../v1/server/redaction/model/Section.java | 31 +++----- .../service/EntityRedactionService.java | 53 ++++---------- .../server/redaction/utils/PositionUtil.java | 72 +++++++++++++++++++ .../service/EntityRedactionServiceTest.java | 3 +- 4 files changed, 98 insertions(+), 61 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 17bbdbe1..3e534656 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -2,10 +2,8 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; -import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; @@ -14,6 +12,7 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; +import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil; import lombok.Builder; import lombok.Data; @@ -45,6 +44,10 @@ public class Section { private Map tabularData; + private Dictionary dictionary; + + private SearchableText searchableText; + public boolean rowEquals(String headerName, String value) { @@ -243,7 +246,7 @@ public class Section { } } while (startIndex > -1); - return removeEntitiesContainedInLarger(found); + return PositionUtil.clearAndFindPositions(found, searchableText, dictionary); } @@ -253,22 +256,6 @@ public class Section { } - public Set removeEntitiesContainedInLarger(Set entities) { - - List wordsToRemove = new ArrayList<>(); - for (Entity word : entities) { - for (Entity inner : entities) { - if (inner.getWord().length() < word.getWord() - .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner) { - wordsToRemove.add(inner); - } - } - } - entities.removeAll(wordsToRemove); - return entities; - } - - public void highlightCell(String cellHeader, int ruleNumber, String type) { annotateCell(cellHeader, ruleNumber, type, false, false, null, null); @@ -309,11 +296,15 @@ public class Section { .getSequences()); // Make sure no other cells with same content are highlighted entity.setLegalBasis(legalBasis); + Set singleEntitySet = new HashSet<>(); + singleEntitySet.add(entity); + PositionUtil.clearAndFindPositions(singleEntitySet, searchableText, dictionary); + // HashSet keeps the older value, but we want the new only. entities.remove(entity); entities.add(entity); - entities = removeEntitiesContainedInLarger(entities); + PositionUtil.removeEntitiesContainedInLarger(entities); if (addAsRecommendations && !isLocal()) { String cleanedWord = word.replaceAll(",", " ").replaceAll(" ", " ").trim() + " "; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 54dc9669..b599fc2f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -28,6 +28,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionS import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.Section; import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; +import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @@ -69,7 +70,7 @@ public class EntityRedactionService { documentEntities.removeAll(foundByLocal); documentEntities.addAll(foundByLocal); - removeEntitiesContainedInLarger(documentEntities); + PositionUtil.removeEntitiesContainedInLarger(documentEntities); } for (Entity entity : documentEntities) { @@ -136,8 +137,7 @@ public class EntityRedactionService { cellStarts.add(cellStart); start = start + cell.toString().trim().length() + 1; } - Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary - .getDictionaryModels(), local); + Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary, local); surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts); sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() @@ -151,6 +151,8 @@ public class EntityRedactionService { .headline(table.getHeadline()) .sectionNumber(sectionNumber) .tabularData(tabularData) + .searchableText(searchableRow) + .dictionary(dictionary) .build(), searchableRow)); sectionNumber++; @@ -159,7 +161,7 @@ public class EntityRedactionService { } addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber); - Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local); + Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary, local); surroundingWordsService.addSurroundingText(entities, searchableText, dictionary); sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() @@ -172,6 +174,8 @@ public class EntityRedactionService { .searchText(searchableText.toString()) .headline(paragraph.getHeadline()) .sectionNumber(sectionNumber) + .searchableText(searchableText) + .dictionary(dictionary) .build(), searchableText)); sectionNumber++; @@ -179,7 +183,7 @@ public class EntityRedactionService { sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { Section analysedRowSection = droolsExecutionService.executeRules(sectionSearchableTextPair.getSection()); - documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), sectionSearchableTextPair.getSearchableText(), dictionary)); + documentEntities.addAll(analysedRowSection.getEntities()); analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> { if (dictionary.isRecommendation(key)) { @@ -210,23 +214,8 @@ public class EntityRedactionService { } - private Set clearAndFindPositions(Set entities, SearchableText text, Dictionary dictionary) { - - removeEntitiesContainedInLarger(entities); - - for (Entity entity : entities) { - if (entity.getPositionSequences().isEmpty()) { - entity.setPositionSequences(text.getSequences(entity.getWord(), dictionary.isCaseInsensitiveDictionary(entity - .getType()), entity.getTargetSequences())); - } - } - - return entities; - } - - private Set findEntities(SearchableText searchableText, String headline, int sectionNumber, - List dictionary, boolean local) { + Dictionary dictionary, boolean local) { Set found = new HashSet<>(); String searchableString = searchableText.toString(); @@ -235,15 +224,15 @@ public class EntityRedactionService { } String lowercaseInputString = searchableString.toLowerCase(); - for (DictionaryModel model : dictionary) { + for (DictionaryModel model : dictionary.getDictionaryModels()) { if (model.isCaseInsensitive()) { found.addAll(find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber, local)); } else { found.addAll(find(searchableString, model.getValues(local), model.getType(), headline, sectionNumber, local)); } } - removeEntitiesContainedInLarger(found); - return found; + + return PositionUtil.clearAndFindPositions(found, searchableText, dictionary); } @@ -281,22 +270,6 @@ public class EntityRedactionService { } - public void removeEntitiesContainedInLarger(Set entities) { - - List wordsToRemove = new ArrayList<>(); - for (Entity word : entities) { - for (Entity inner : entities) { - if (inner.getWord().length() < word.getWord() - .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word - .getSectionNumber() == inner.getSectionNumber()) { - wordsToRemove.add(inner); - } - } - } - entities.removeAll(wordsToRemove); - } - - private void addSectionToManualRedactions(List textBlocks, ManualRedactions manualRedactions, String section, int sectionNumber) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java new file mode 100644 index 00000000..9f692592 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java @@ -0,0 +1,72 @@ +package com.iqser.red.service.redaction.v1.server.redaction.utils; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; + +import lombok.experimental.UtilityClass; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@UtilityClass +public class PositionUtil { + + public Set clearAndFindPositions(Set entities, SearchableText text, Dictionary dictionary) { + + Map> entitiesByWord = new HashMap<>(); + + for (Entity entity : entities) { + entitiesByWord.computeIfAbsent(entity.getWord(), (x) -> new ArrayList<>()).add(entity); + } + + for (String word : entitiesByWord.keySet()) { + + List orderedEntities = entitiesByWord.get(word) + .stream() + .sorted(Comparator.comparing(Entity::getStart)) + .collect(Collectors.toList()); + Entity firstEntity = orderedEntities.get(0); + List positionSequences = text.getSequences(firstEntity.getWord(), dictionary.isCaseInsensitiveDictionary(firstEntity + .getType()), firstEntity.getTargetSequences()); + + for (int i = 0; i <= orderedEntities.size() - 1; i++) { + try { + orderedEntities.get(i).setPositionSequences(List.of(positionSequences.get(i))); + } catch (Exception e){ + log.warn("Mismatch between EntityPositionSequence and found Entity!"); + } + } + } + + removeEntitiesContainedInLarger(entities); + + return entities; + } + + + public void removeEntitiesContainedInLarger(Set entities) { + + List wordsToRemove = new ArrayList<>(); + for (Entity word : entities) { + for (Entity inner : entities) { + if (inner.getWord().length() < word.getWord() + .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word + .getSectionNumber() == inner.getSectionNumber()) { + wordsToRemove.add(inner); + } + } + } + entities.removeAll(wordsToRemove); + } + + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index f7ecded7..605e7c12 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -48,6 +48,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; @@ -108,7 +109,7 @@ public class EntityRedactionServiceTest { Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false); entities.add(nested); entities.add(nesting); - entityRedactionService.removeEntitiesContainedInLarger(entities); + PositionUtil.removeEntitiesContainedInLarger(entities); assertThat(entities.size()).isEqualTo(1); assertThat(entities).contains(nesting);