RED-2845 Bugfix: Avoid the expansion if it would result in a redaction overlap and bugfix in RegExp in drools
This commit is contained in:
parent
44768045a7
commit
3af46fe48c
@ -1,13 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.FileAttribute;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
@ -20,7 +13,16 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.FileAttribute;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Data
|
||||
@Slf4j
|
||||
@ -59,29 +61,57 @@ public class Section {
|
||||
private List<FileAttribute> fileAttributes = new ArrayList<>();
|
||||
|
||||
|
||||
public boolean fileAttributeByIdEquals(String id, String value){
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())).findFirst().isPresent();
|
||||
}
|
||||
public boolean fileAttributeByIdEquals(String id, String value) {
|
||||
|
||||
public boolean fileAttributeByPlaceholderEquals(String placeholder, String value){
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())).findFirst().isPresent();
|
||||
}
|
||||
|
||||
public boolean fileAttributeByLabelEquals(String label, String value){
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())).findFirst().isPresent();
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
public boolean fileAttributeByIdEqualsIgnoreCase(String id, String value){
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
|
||||
public boolean fileAttributeByPlaceholderEquals(String placeholder, String value) {
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(String placeholder, String value){
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
public boolean fileAttributeByLabelEquals(String label, String value) {
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
public boolean fileAttributeByLabelEqualsIgnoreCase(String label, String value){
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
public boolean fileAttributeByIdEqualsIgnoreCase(String id, String value) {
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(String placeholder, String value) {
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
public boolean fileAttributeByLabelEqualsIgnoreCase(String label, String value) {
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@ -121,11 +151,13 @@ public class Section {
|
||||
|
||||
|
||||
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group) {
|
||||
|
||||
expandByRegEx(type, pattern, patternCaseInsensitive, group, null);
|
||||
}
|
||||
|
||||
|
||||
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, String withoutPattern) {
|
||||
public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group,
|
||||
String withoutPattern) {
|
||||
|
||||
Pattern compiledWithoutPattern = null;
|
||||
if (withoutPattern != null) {
|
||||
@ -141,7 +173,7 @@ public class Section {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(withoutPattern != null) {
|
||||
if (withoutPattern != null) {
|
||||
Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord());
|
||||
if (matcherWithout.find()) {
|
||||
continue;
|
||||
@ -152,11 +184,10 @@ public class Section {
|
||||
|
||||
while (matcher.find()) {
|
||||
String match = matcher.group(group);
|
||||
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
|
||||
expanded.addAll(findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity
|
||||
.getRedactionReason(), entity.getLegalBasis()));
|
||||
|
||||
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis());
|
||||
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -222,7 +253,8 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public void ignore(String type){
|
||||
public void ignore(String type) {
|
||||
|
||||
entities.removeIf(entity -> entity.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,16 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
@ -9,10 +20,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
@SuppressWarnings("PMD")
|
||||
@ -36,8 +43,7 @@ public class EntitySearchUtils {
|
||||
startIndex = inputString.indexOf(cleanValue, stopIndex);
|
||||
stopIndex = startIndex + cleanValue.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
return true;
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
@ -65,8 +71,7 @@ public class EntitySearchUtils {
|
||||
startIndex = inputString.indexOf(cleanValue, stopIndex);
|
||||
stopIndex = startIndex + cleanValue.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
@ -120,8 +125,7 @@ public class EntitySearchUtils {
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
|
||||
.getSectionNumber() == inner.getSectionNumber()) {
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
@ -154,4 +158,20 @@ public class EntitySearchUtils {
|
||||
entities.addAll(found);
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> findNonOverlappingMatchEntities(Set<Entity> existingEntities, Set<Entity> foundEntities) {
|
||||
|
||||
Set<Entity> result = new HashSet<>();
|
||||
if (existingEntities != null && foundEntities != null) {
|
||||
for (Entity existingEntity : existingEntities) {
|
||||
for (Entity foundEntity : foundEntities) {
|
||||
if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) {
|
||||
result.add(foundEntity);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,173 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
|
||||
public class EntitySearchUtilsTest {
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedEnd() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false);
|
||||
Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false);
|
||||
Entity foundEntities2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(2);
|
||||
assertThat(result).contains(foundEntities1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedStartAndEndOverlapping() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false);
|
||||
Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false);
|
||||
Entity foundEntities2 = new Entity("X. Superman Y.", "fake type", 7, 20, "fake headline", 0, false, false);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedStartAndEnd() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false);
|
||||
Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false);
|
||||
Entity foundEntities2 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedExistingAndExpandedEnd() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false);
|
||||
Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false);
|
||||
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedEndLong() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false);
|
||||
Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false);
|
||||
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -19,8 +19,8 @@ rule "0: Expand CBI Authors with firstname initials"
|
||||
when
|
||||
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
|
||||
then
|
||||
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
|
||||
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
|
||||
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
|
||||
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
|
||||
end
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user