RED-2845 Bugfix: Avoid the expansion if it would result in a redaction overlap and bugfix in RegExp in drools

This commit is contained in:
Philipp Schramm 2021-12-02 14:29:53 +01:00
parent 44768045a7
commit 3af46fe48c
4 changed files with 268 additions and 43 deletions

View File

@ -1,13 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import java.util.ArrayList;
import java.util.Collection;
@ -20,7 +13,16 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
@Data
@Slf4j
@ -59,29 +61,57 @@ public class Section {
private List<FileAttribute> fileAttributes = new ArrayList<>();
public boolean fileAttributeByIdEquals(String id, String value){
return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())).findFirst().isPresent();
}
public boolean fileAttributeByIdEquals(String id, String value) {
public boolean fileAttributeByPlaceholderEquals(String placeholder, String value){
return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())).findFirst().isPresent();
}
public boolean fileAttributeByLabelEquals(String label, String value){
return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())).findFirst().isPresent();
return fileAttributes != null && fileAttributes.stream()
.filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue()))
.findFirst()
.isPresent();
}
public boolean fileAttributeByIdEqualsIgnoreCase(String id, String value){
return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
/**
 * Checks whether any file attribute carries the given placeholder together with the given value.
 *
 * @param placeholder placeholder to look for; it is the receiver of {@code equals}, so it must not be
 *                    {@code null} when attributes are present
 * @param value       expected attribute value, compared case-sensitively; it is the receiver of
 *                    {@code equals}, so it must not be {@code null} when attributes are present
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches both the
 *         placeholder and the value, {@code false} otherwise
 */
public boolean fileAttributeByPlaceholderEquals(String placeholder, String value) {
    if (fileAttributes == null) {
        return false;
    }
    // anyMatch expresses the existence check directly and stops at the first matching attribute.
    return fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue()));
}
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(String placeholder, String value){
return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
/**
 * Checks whether any file attribute carries the given label together with the given value.
 *
 * @param label label to look for; it is the receiver of {@code equals}, so it must not be {@code null}
 *              when attributes are present
 * @param value expected attribute value, compared case-sensitively; it is the receiver of
 *              {@code equals}, so it must not be {@code null} when attributes are present
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches both the
 *         label and the value, {@code false} otherwise
 */
public boolean fileAttributeByLabelEquals(String label, String value) {
    if (fileAttributes == null) {
        return false;
    }
    // anyMatch expresses the existence check directly and stops at the first matching attribute.
    return fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue()));
}
public boolean fileAttributeByLabelEqualsIgnoreCase(String label, String value){
return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
/**
 * Checks whether any file attribute has the given id and a value equal to the given one, ignoring case.
 *
 * @param id    attribute id to look for, compared case-sensitively; it is the receiver of
 *              {@code equals}, so it must not be {@code null} when attributes are present
 * @param value expected attribute value, compared with {@code equalsIgnoreCase}; it is the receiver of
 *              that call, so it must not be {@code null} when an attribute with a matching id exists
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches both the
 *         id and the value, {@code false} otherwise
 */
public boolean fileAttributeByIdEqualsIgnoreCase(String id, String value) {
    if (fileAttributes == null) {
        return false;
    }
    // anyMatch expresses the existence check directly and stops at the first matching attribute.
    return fileAttributes.stream()
            .anyMatch(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue()));
}
/**
 * Checks whether any file attribute has the given placeholder and a value equal to the given one,
 * ignoring case.
 *
 * @param placeholder placeholder to look for, compared case-sensitively; it is the receiver of
 *                    {@code equals}, so it must not be {@code null} when attributes are present
 * @param value       expected attribute value, compared with {@code equalsIgnoreCase}; it is the
 *                    receiver of that call, so it must not be {@code null} when an attribute with a
 *                    matching placeholder exists
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches both the
 *         placeholder and the value, {@code false} otherwise
 */
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(String placeholder, String value) {
    if (fileAttributes == null) {
        return false;
    }
    // anyMatch expresses the existence check directly and stops at the first matching attribute.
    return fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue()));
}
/**
 * Checks whether any file attribute has the given label and a value equal to the given one, ignoring case.
 *
 * @param label label to look for, compared case-sensitively; it is the receiver of {@code equals}, so it
 *              must not be {@code null} when attributes are present
 * @param value expected attribute value, compared with {@code equalsIgnoreCase}; it is the receiver of
 *              that call, so it must not be {@code null} when an attribute with a matching label exists
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches both the
 *         label and the value, {@code false} otherwise
 */
public boolean fileAttributeByLabelEqualsIgnoreCase(String label, String value) {
    if (fileAttributes == null) {
        return false;
    }
    // anyMatch expresses the existence check directly and stops at the first matching attribute.
    return fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue()));
}
@ -121,11 +151,13 @@ public class Section {
/**
 * Convenience overload that runs the regex expansion without an exclusion ("without") pattern.
 * All arguments are forwarded unchanged to the five-argument overload.
 *
 * @param type                   forwarded as-is
 * @param pattern                forwarded as-is
 * @param patternCaseInsensitive forwarded as-is
 * @param group                  forwarded as-is
 */
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group) {
    // A null exclusion pattern makes the five-argument overload skip its "without" check.
    final String noWithoutPattern = null;
    expandByRegEx(type, pattern, patternCaseInsensitive, group, noWithoutPattern);
}
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, String withoutPattern) {
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group,
String withoutPattern) {
Pattern compiledWithoutPattern = null;
if (withoutPattern != null) {
@ -141,7 +173,7 @@ public class Section {
continue;
}
if(withoutPattern != null) {
if (withoutPattern != null) {
Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord());
if (matcherWithout.find()) {
continue;
@ -152,11 +184,10 @@ public class Section {
while (matcher.find()) {
String match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
expanded.addAll(findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity
.getRedactionReason(), entity.getLegalBasis()));
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis());
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
}
}
}
@ -222,7 +253,8 @@ public class Section {
}
public void ignore(String type){
/**
 * Removes every entity whose type equals the given type from {@code entities}.
 *
 * @param type type of the entities to discard; passed as the argument of
 *             {@code entity.getType().equals(...)}
 */
public void ignore(String type) {
    entities.removeIf(candidate -> {
        boolean sameType = candidate.getType().equals(type);
        return sameType;
    });
}

View File

@ -1,5 +1,16 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
@ -9,10 +20,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Slf4j
@UtilityClass
@SuppressWarnings("PMD")
@ -36,8 +43,7 @@ public class EntitySearchUtils {
startIndex = inputString.indexOf(cleanValue, stopIndex);
stopIndex = startIndex + cleanValue.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
return true;
}
} while (startIndex > -1);
@ -65,8 +71,7 @@ public class EntitySearchUtils {
startIndex = inputString.indexOf(cleanValue, stopIndex);
stopIndex = startIndex + cleanValue.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
}
} while (startIndex > -1);
@ -120,8 +125,7 @@ public class EntitySearchUtils {
for (Entity word : entities) {
for (Entity inner : entities) {
if (inner.getWord().length() < word.getWord()
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
.getSectionNumber() == inner.getSectionNumber()) {
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
wordsToRemove.add(inner);
}
}
@ -154,4 +158,20 @@ public class EntitySearchUtils {
entities.addAll(found);
}
/**
 * Filters newly found entities against the entities that already exist.
 * <p>
 * Every existing entity is compared with every found entity. A found entity is put into the result as
 * soon as one existing entity lies strictly before it or strictly after it. Start and end are compared
 * with strict inequalities, so ranges that share a boundary index count as overlapping.
 * <p>
 * NOTE(review): a found entity is kept when it is disjoint from at least one existing entity, not from
 * all of them. With an empty {@code existingEntities} set the result is therefore always empty. Confirm
 * that this is the intended contract.
 *
 * @param existingEntities entities that are already known; may be {@code null}
 * @param foundEntities    candidate entities to filter; may be {@code null}
 * @return a new mutable set with the accepted candidates; empty if either argument is {@code null}
 */
public Set<Entity> findNonOverlappingMatchEntities(Set<Entity> existingEntities, Set<Entity> foundEntities) {

    Set<Entity> accepted = new HashSet<>();
    if (existingEntities == null || foundEntities == null) {
        return accepted;
    }

    for (Entity known : existingEntities) {
        for (Entity candidate : foundEntities) {
            // The two ranges intersect unless one of them ends before the other one starts.
            boolean rangesIntersect = known.getEnd() >= candidate.getStart() && candidate.getEnd() >= known.getStart();
            if (rangesIntersect) {
                continue;
            }
            accepted.add(candidate);
        }
    }
    return accepted;
}
}

View File

@ -0,0 +1,173 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import static org.assertj.core.api.Assertions.assertThat;
import java.util.HashSet;
import java.util.Set;
import org.junit.Test;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
/**
 * Tests for {@code EntitySearchUtils.findNonOverlappingMatchEntities}.
 *
 * <pre>
 * All fixtures refer to the same sample text; start and end are inclusive character positions:
 *
 * Text:     Batman X. Superman Y.
 * Position: 0123456789
 *                     0123456789
 *                               0
 * </pre>
 */
public class EntitySearchUtilsTest {

    /** Builds a fixture entity; only the word and its position differ between fixtures. */
    private static Entity entity(String word, int start, int end) {
        return new Entity(word, "fake type", start, end, "fake headline", 0, false, false);
    }


    /** Collects two fixture entities into a mutable set, adding them in the given order. */
    private static Set<Entity> entitySet(Entity first, Entity second) {
        Set<Entity> entities = new HashSet<>();
        entities.add(first);
        entities.add(second);
        return entities;
    }


    /** Both existing entities are expanded at their end; both expansions are expected in the result. */
    @Test
    public void testNotOverlappingEntitiesExpandedEnd() {

        // Arrange
        Set<Entity> existing = entitySet(entity("Batman", 0, 5), entity("Superman", 10, 17));
        Entity batmanExpanded = entity("Batman X.", 0, 8);
        Entity supermanExpanded = entity("Superman Y.", 10, 20);
        Set<Entity> found = entitySet(batmanExpanded, supermanExpanded);

        // Act
        Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existing, found);

        // Assert
        assertThat(result).hasSize(2).contains(batmanExpanded, supermanExpanded);
    }


    /** A candidate expanded at start and end intersects both existing entities and is rejected. */
    @Test
    public void testNotOverlappingEntitiesExpandedStartAndEndOverlapping() {

        // Arrange
        Set<Entity> existing = entitySet(entity("Batman X.", 0, 8), entity("Superman", 10, 17));
        Entity batmanExpanded = entity("Batman X.", 0, 8);
        Entity supermanExpandedBothSides = entity("X. Superman Y.", 7, 20);
        Set<Entity> found = entitySet(batmanExpanded, supermanExpandedBothSides);

        // Act
        Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existing, found);

        // Assert
        assertThat(result).hasSize(1).contains(batmanExpanded);
    }


    /** A candidate expanded only at its start still intersects both existing entities and is rejected. */
    @Test
    public void testNotOverlappingEntitiesExpandedStartAndEnd() {

        // Arrange
        Set<Entity> existing = entitySet(entity("Batman X.", 0, 8), entity("Superman", 10, 17));
        Entity batmanExpanded = entity("Batman X.", 0, 8);
        Entity supermanExpandedStart = entity("X. Superman", 7, 17);
        Set<Entity> found = entitySet(batmanExpanded, supermanExpandedStart);

        // Act
        Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existing, found);

        // Assert
        assertThat(result).hasSize(1).contains(batmanExpanded);
    }


    /** An existing entity already covers "X."; expanding "Batman" into that range is rejected. */
    @Test
    public void testNotOverlappingEntitiesExpandedExistingAndExpandedEnd() {

        // Arrange
        Set<Entity> existing = entitySet(entity("X. Superman", 7, 17), entity("Batman", 0, 5));
        Entity batmanExpanded = entity("Batman X.", 0, 8);
        Entity superman = entity("Superman", 10, 17);
        Set<Entity> found = entitySet(batmanExpanded, superman);

        // Act
        Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existing, found);

        // Assert
        assertThat(result).hasSize(1).contains(superman);
    }


    /** A candidate spanning the whole range of both existing entities is rejected. */
    @Test
    public void testNotOverlappingEntitiesExpandedEndLong() {

        // Arrange
        Set<Entity> existing = entitySet(entity("X. Superman", 7, 17), entity("Batman", 0, 5));
        Entity batmanExpandedLong = entity("Batman X. Superman", 0, 17);
        Entity superman = entity("Superman", 10, 17);
        Set<Entity> found = entitySet(batmanExpandedLong, superman);

        // Act
        Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existing, found);

        // Assert
        assertThat(result).hasSize(1).contains(superman);
    }

}

View File

@ -19,8 +19,8 @@ rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
then
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
end