diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index c711b6ed..a092f992 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -1,26 +1,34 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; +import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + import com.iqser.red.service.redaction.v1.model.ArgumentType; import com.iqser.red.service.redaction.v1.model.Engine; import com.iqser.red.service.redaction.v1.model.FileAttribute; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; + import lombok.Builder; import lombok.Data; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang3.StringUtils; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX; @Data @Slf4j @@ -62,39 +70,69 @@ public class Section { @WhenCondition public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id, @Argument(ArgumentType.STRING) String value) { - return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())).findFirst().isPresent(); + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())) + .findFirst() + .isPresent(); } + @WhenCondition public boolean fileAttributeByPlaceholderEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder, @Argument(ArgumentType.STRING) String value) { - return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())).findFirst().isPresent(); + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())) + .findFirst() + .isPresent(); } + @WhenCondition public boolean fileAttributeByLabelEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String label, @Argument(ArgumentType.STRING) String value) { - return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())).findFirst().isPresent(); + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())) + .findFirst() + .isPresent(); } + @WhenCondition public boolean fileAttributeByIdEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String id, @Argument(ArgumentType.STRING) String value) { - return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent(); + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())) + .findFirst() + .isPresent(); } + @WhenCondition public boolean fileAttributeByPlaceholderEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder, @Argument(ArgumentType.STRING) String value) { - return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent(); + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())) + .findFirst() + .isPresent(); } + @WhenCondition public boolean fileAttributeByLabelEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String label, @Argument(ArgumentType.STRING) String value) { - return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent(); + + return fileAttributes != null && fileAttributes.stream() + .filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())) + .findFirst() + .isPresent(); } + @WhenCondition public boolean rowEquals(@Argument(ArgumentType.STRING) String headerName, @Argument(ArgumentType.STRING) String value) { @@ -106,6 +144,7 @@ public class Section { .equals(value); } + @WhenCondition public boolean hasTableHeader(@Argument(ArgumentType.STRING) String headerName) { @@ -113,18 +152,21 @@ public class Section { return tabularData != null && tabularData.containsKey(cleanHeaderName); } + @WhenCondition public boolean matchesType(@Argument(ArgumentType.TYPE) String type) { return entities.stream().anyMatch(entity -> entity.getType().equals(type)); } + @WhenCondition public boolean matchesImageType(@Argument(ArgumentType.TYPE) String type) { return images.stream().anyMatch(image -> image.getType().equals(type)); } + @WhenCondition public boolean headlineContainsWord(@Argument(ArgumentType.STRING) String word) { @@ -133,16 +175,16 @@ public class Section { @ThenAction - public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, - @Argument(ArgumentType.REGEX) String pattern, + public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group) { + expandByRegEx(type, pattern, patternCaseInsensitive, group, null); } + @ThenAction - public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, - @Argument(ArgumentType.REGEX) String pattern, + public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.REGEX) String withoutPattern) { @@ -162,7 +204,7 @@ public class Section { continue; } - if(withoutPattern != null) { + if (withoutPattern != null) { Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord()); if (matcherWithout.find()) { continue; @@ -173,10 +215,10 @@ public class Section { while (matcher.find()) { String match = matcher.group(group); - if (StringUtils.isNotBlank(match)) { - expanded.addAll(findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity - .getRedactionReason(), entity.getLegalBasis())); + if (StringUtils.isNotBlank(match)) { + Set expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis()); + expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities)); } } } @@ -185,6 +227,7 @@ public class Section { EntitySearchUtils.removeEntitiesContainedInLarger(entities); } + @ThenAction public void redactImage(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @@ -201,9 +244,9 @@ public class Section { }); } + @ThenAction - public void redact(@Argument(ArgumentType.TYPE) String type, - @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, + public void redact(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.STRING) String reason, @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { @@ -220,6 +263,7 @@ public class Section { }); } + @ThenAction public void redactNotImage(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @@ -234,9 +278,9 @@ public class Section { }); } + @ThenAction - public void redactNot(@Argument(ArgumentType.TYPE) String type, - @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, + public void redactNot(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.STRING) String reason) { boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); @@ -260,12 +304,14 @@ public class Section { @ThenAction public void redactNotAndReference(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REFERENCE_TYPE) String referenceType, - @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, - @Argument(ArgumentType.STRING) String reason) { + @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, + @Argument(ArgumentType.STRING) String reason) { boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); - Set references = entities.stream().filter(entity -> entity.getType().equals(referenceType)).collect(Collectors.toSet()); + Set references = entities.stream() + .filter(entity -> entity.getType().equals(referenceType)) + .collect(Collectors.toSet()); entities.forEach(entity -> { if (entity.getType().equals(type) || hasRecommendationDictionary && entity.getType() @@ -279,7 +325,6 @@ public class Section { } - @ThenAction public void expandToHintAnnotationByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.STRING) String pattern, @@ -310,6 +355,7 @@ public class Section { EntitySearchUtils.removeEntitiesContainedInLarger(entities); } + @ThenAction public void addHintAnnotationByRegEx(@Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @@ -329,6 +375,7 @@ public class Section { } } + @ThenAction public void redactIfPrecededBy(@Argument(ArgumentType.STRING) String prefix, @Argument(ArgumentType.TYPE) String type, @@ -346,6 +393,7 @@ public class Section { }); } + @ThenAction public void addHintAnnotation(@Argument(ArgumentType.STRING) String value, @Argument(ArgumentType.TYPE) String asType) { @@ -354,9 +402,9 @@ public class Section { EntitySearchUtils.addEntitiesIgnoreRank(entities, found); } + @ThenAction - public void addRedaction(@Argument(ArgumentType.STRING) String value, - @Argument(ArgumentType.TYPE) String asType, + public void addRedaction(@Argument(ArgumentType.STRING) String value, @Argument(ArgumentType.TYPE) String asType, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.STRING) String reason, @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { @@ -365,9 +413,9 @@ public class Section { EntitySearchUtils.addEntitiesIgnoreRank(entities, found); } + @ThenAction - public void redactLineAfter(@Argument(ArgumentType.STRING) String start, - @Argument(ArgumentType.TYPE) String asType, + public void redactLineAfter(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.TYPE) String asType, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.BOOLEAN) boolean redactEverywhere, @Argument(ArgumentType.STRING) String reason, @@ -389,6 +437,7 @@ public class Section { } } + @ThenAction public void recommendLineAfter(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.TYPE) String asType) { @@ -414,11 +463,11 @@ public class Section { } } + @ThenAction public void redactByRegEx(@Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, - @Argument(ArgumentType.INTEGER) int group, - @Argument(ArgumentType.TYPE) String asType, + @Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.TYPE) String asType, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.STRING) String reason, @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { @@ -436,6 +485,7 @@ public class Section { } } + @ThenAction public void addRecommendationByRegEx(@Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @@ -454,6 +504,7 @@ public class Section { } } + @ThenAction public void redactAndRecommendByRegEx(@Argument(ArgumentType.REGEX) String pattern, @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, @@ -476,9 +527,9 @@ public class Section { } } + @ThenAction - public void redactBetween(@Argument(ArgumentType.STRING) String start, - @Argument(ArgumentType.STRING) String stop, + public void redactBetween(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.STRING) String stop, @Argument(ArgumentType.TYPE) String asType, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.BOOLEAN) boolean redactEverywhere, @@ -502,6 +553,7 @@ public class Section { } } + @ThenAction public void redactLinesBetween(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.STRING) String stop, @@ -536,6 +588,7 @@ public class Section { } } + @ThenAction public void highlightCell(@Argument(ArgumentType.STRING) String cellHeader, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @@ -544,10 +597,10 @@ public class Section { annotateCell(cellHeader, ruleNumber, type, false, false, null, null); } + @ThenAction public void redactCell(@Argument(ArgumentType.STRING) String cellHeader, - @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, - @Argument(ArgumentType.TYPE) String type, + @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.BOOLEAN) boolean addAsRecommendations, @Argument(ArgumentType.STRING) String reason, @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { @@ -555,6 +608,7 @@ public class Section { annotateCell(cellHeader, ruleNumber, type, true, addAsRecommendations, reason, legalBasis); } + @ThenAction public void redactNotCell(@Argument(ArgumentType.STRING) String cellHeader, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @@ -641,6 +695,7 @@ public class Section { } } + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD) public @interface WhenCondition { @@ -658,6 +713,7 @@ public class Section { public @interface Argument { ArgumentType value() default ArgumentType.STRING; + } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index d514df1e..7d101356 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -1,5 +1,17 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + import com.iqser.red.service.redaction.v1.model.Engine; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue; @@ -10,10 +22,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; -import java.util.*; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - @Slf4j @UtilityClass @SuppressWarnings("PMD") @@ -37,8 +45,7 @@ public class EntitySearchUtils { startIndex = inputString.indexOf(cleanValue, stopIndex); stopIndex = startIndex + cleanValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString - .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { return true; } } while (startIndex > -1); @@ -66,8 +73,7 @@ public class EntitySearchUtils { startIndex = inputString.indexOf(cleanValue, stopIndex); stopIndex = startIndex + cleanValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString - .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, isDictionaryEntry, isDossierDictionary, engine)); } } while (startIndex > -1); @@ -121,8 +127,7 @@ public class EntitySearchUtils { for (Entity word : entities) { for (Entity inner : entities) { if (inner.getWord().length() < word.getWord() - .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word - .getSectionNumber() == inner.getSectionNumber()) { + .length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) { wordsToRemove.add(inner); } } @@ -141,14 +146,14 @@ public class EntitySearchUtils { if (entities.contains(found)) { Optional existingOptional = entities.stream().filter(entity -> entity.equals(found)).findFirst(); - if(!existingOptional.isPresent()){ + if (!existingOptional.isPresent()) { return; } var existing = existingOptional.get(); - if(existing.getType().equals(found.getType())){ + if (existing.getType().equals(found.getType())) { existing.getEngines().addAll(found.getEngines()); - } else if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){ + } else if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) { entities.remove(found); entities.add(found); } @@ -165,12 +170,14 @@ public class EntitySearchUtils { } - public void addOrAddEngine(Set existing, Set toBeAdded){ + public void addOrAddEngine(Set existing, Set toBeAdded) { - for(Entity toAdd: toBeAdded){ + for (Entity toAdd : toBeAdded) { if (existing.contains(toAdd)) { - Optional existingOptional = existing.stream().filter(entity -> entity.equals(toAdd)).findFirst(); - if(!existingOptional.isPresent()){ + Optional existingOptional = existing.stream() + .filter(entity -> entity.equals(toAdd)) + .findFirst(); + if (!existingOptional.isPresent()) { return; } var existingEntity = existingOptional.get(); @@ -181,4 +188,20 @@ public class EntitySearchUtils { } } + + public Set findNonOverlappingMatchEntities(Set existingEntities, Set foundEntities) { + + Set result = new HashSet<>(); + if (existingEntities != null && foundEntities != null) { + for (Entity existingEntity : existingEntities) { + for (Entity foundEntity : foundEntities) { + if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) { + result.add(foundEntity); + } + } + } + } + return result; + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java index ae2fb019..eec7d419 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java @@ -27,4 +27,164 @@ public class EntitySearchUtilsTest { } + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedEnd() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE); + Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE); + Entity foundEntities2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(2); + assertThat(result).contains(foundEntities1); + assertThat(result).contains(foundEntities2); + + } + + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedStartAndEndOverlapping() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE); + Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE); + Entity foundEntities2 = new Entity("X. Superman Y.", "fake type", 7, 20, "fake headline", 0, false, false, Engine.RULE); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities1); + + } + + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedStartAndEnd() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE); + Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE); + Entity foundEntities2 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities1); + + } + + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedExistingAndExpandedEnd() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE); + Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE); + Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities2); + + } + + /* + * Text: Batman X. Superman Y. + * Position: 0123456789 + * 0123456789 + * 0123456789 + */ + @Test + public void testNotOverlappingEntitiesExpandedEndLong() { + + // Arrange + Set existingEntities = new HashSet<>(); + Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE); + Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE); + existingEntities.add(existingEntity1); + existingEntities.add(existingEntity2); + + Set foundEntities = new HashSet<>(); + Entity foundEntities1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE); + Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE); + foundEntities.add(foundEntities1); + foundEntities.add(foundEntities2); + + // Act + Set result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities); + + // Assert + assertThat(result.size()).isEqualTo(1); + assertThat(result).contains(foundEntities2); + + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt index e2854b4c..9cf34080 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt @@ -1,3 +1,6 @@ +Foo +F. Bar +B. Foo Johnson R | Weissler M S and Butters C A AD Hurt diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 2ed23658..007fea23 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -19,8 +19,8 @@ rule "0: Expand CBI Authors with firstname initials" when Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author")) then - section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); - section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+"); + section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+"); + section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+"); end diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/foo.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/foo.pdf new file mode 100644 index 00000000..b6ddcbc0 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/foo.pdf differ