diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index d92c01a1..f87e937f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -1,13 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import com.iqser.red.service.redaction.v1.model.FileAttribute; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; -import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; -import lombok.Builder; -import lombok.Data; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang3.StringUtils; +import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; import java.util.ArrayList; import java.util.Collection; @@ -20,7 +13,16 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; +import org.apache.commons.lang3.StringUtils; + +import com.iqser.red.service.redaction.v1.model.FileAttribute; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; +import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; + +import lombok.Builder; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; @Data @Slf4j @@ -59,29 +61,57 @@ public class Section { private List fileAttributes = new ArrayList<>(); - public boolean fileAttributeByIdEquals(String id, String value){ - return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())).findFirst().isPresent(); - } + public boolean fileAttributeByIdEquals(String id, String value) { - public boolean fileAttributeByPlaceholderEquals(String placeholder, String value){ - return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())).findFirst().isPresent(); - } - - public boolean fileAttributeByLabelEquals(String label, String value){ - return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())).findFirst().isPresent(); + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())) + .findFirst() + .isPresent(); } - public boolean fileAttributeByIdEqualsIgnoreCase(String id, String value){ - return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent(); + public boolean fileAttributeByPlaceholderEquals(String placeholder, String value) { + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())) + .findFirst() + .isPresent(); } - public boolean fileAttributeByPlaceholderEqualsIgnoreCase(String placeholder, String value){ - return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent(); + + public boolean fileAttributeByLabelEquals(String label, String value) { + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())) + .findFirst() + .isPresent(); } - public boolean fileAttributeByLabelEqualsIgnoreCase(String label, String value){ - return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent(); + + public boolean fileAttributeByIdEqualsIgnoreCase(String id, String value) { + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())) + .findFirst() + .isPresent(); + } + + + public boolean fileAttributeByPlaceholderEqualsIgnoreCase(String placeholder, String value) { + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())) + .findFirst() + .isPresent(); + } + + + public boolean fileAttributeByLabelEqualsIgnoreCase(String label, String value) { + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())) + .findFirst() + .isPresent(); } @@ -121,11 +151,13 @@ public class Section { public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group) { + expandByRegEx(type, pattern, patternCaseInsensitive, group, null); } - public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, String withoutPattern) { + public void expandByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, + String withoutPattern) { Pattern compiledWithoutPattern = null; if (withoutPattern != null) { @@ -141,7 +173,7 @@ public class Section { continue; } - if(withoutPattern != null) { + if (withoutPattern != null) { Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord()); if (matcherWithout.find()) { continue; @@ -152,11 +184,10 @@ public class Section { while (matcher.find()) { String match = matcher.group(group); + if (StringUtils.isNotBlank(match)) { - - expanded.addAll(findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity - .getRedactionReason(), entity.getLegalBasis())); - + Set expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis()); + expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities)); } } } @@ -222,7 +253,8 @@ public class Section { } - public void ignore(String type){ + public void ignore(String type) { + entities.removeIf(entity -> entity.getType().equals(type)); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index 0daca578..253fe54e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -1,5 +1,16 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; @@ -9,10 +20,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; -import java.util.*; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - @Slf4j @UtilityClass @SuppressWarnings("PMD") @@ -36,8 +43,7 @@ public class EntitySearchUtils { startIndex = inputString.indexOf(cleanValue, stopIndex); stopIndex = startIndex + cleanValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString - .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { return true; } } while (startIndex > -1); @@ -65,8 +71,7 @@ public class EntitySearchUtils { startIndex = inputString.indexOf(cleanValue, stopIndex); stopIndex = startIndex + cleanValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString - .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local, isDossierDictionary)); } } while (startIndex > -1); @@ -120,8 +125,7 @@ public class EntitySearchUtils { for (Entity word : entities) { for (Entity inner : entities) { if (inner.getWord().length() < word.getWord() - .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word - .getSectionNumber() == inner.getSectionNumber()) { + .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) { wordsToRemove.add(inner); } } @@ -154,4 +158,20 @@ public class EntitySearchUtils { entities.addAll(found); } + + public Set findNonOverlappingMatchEntities(Set existingEntities, Set foundEntities) { + + Set result = new HashSet<>(); + if (existingEntities != null && foundEntities != null) { + for (Entity existingEntity : existingEntities) { + for (Entity foundEntity : foundEntities) { + if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) { + result.add(foundEntity); + } + } + } + } + return result; + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java new file mode 100644 index 00000000..55c279df --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java @@ -0,0 +1,173 @@ +package com.iqser.red.service.redaction.v1.server.redaction.utils; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; + +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; + +public class EntitySearchUtilsTest { + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedEnd() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false); + Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false); + Entity foundEntities2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(2); + assertThat(result).contains(foundEntities1); + assertThat(result).contains(foundEntities2); + + } + + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedStartAndEndOverlapping() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false); + Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false); + Entity foundEntities2 = new Entity("X. Superman Y.", "fake type", 7, 20, "fake headline", 0, false, false); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities1); + + } + + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedStartAndEnd() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false); + Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false); + Entity foundEntities2 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities1); + + } + + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedExistingAndExpandedEnd() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false); + Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false); + Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities2); + + } + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedEndLong() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false); + Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false); + Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities2); + + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 7b4d5dc5..47ad0da7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -19,8 +19,8 @@ rule "0: Expand CBI Authors with firstname initials" when Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) then - section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); - section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); + section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+"); + section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+"); end