Pull request #286: Bugfix/RED-2845 3.0

Merge in RED/redaction-service from bugfix/RED-2845_3.0 to master

* commit '1c982a40422a16e9caa8d264e4221524c7e1c92b':
  RED-2845 Bugfix: Avoid the expansion if it would result in a redaction overlap and bugfix in RegExp in drools
This commit is contained in:
Philipp Schramm 2021-12-02 14:07:45 +01:00
commit cda8e51d9d
6 changed files with 305 additions and 63 deletions

View File

@ -1,26 +1,34 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.model.ArgumentType;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
@Data
@Slf4j
@ -62,39 +70,69 @@ public class Section {
/**
 * Checks whether any file attribute carries the given id together with exactly the given value.
 *
 * @param id    attribute id to look for; must not be null, it is dereferenced for the comparison
 * @param value expected attribute value, compared case-sensitively; must not be null
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches
 *         both the id and the value, {@code false} otherwise
 */
@WhenCondition
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
                                       @Argument(ArgumentType.STRING) String value) {

    // anyMatch short-circuits on the first hit, like the former filter/findFirst/isPresent chain.
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue()));
}
/**
 * Checks whether any file attribute carries the given placeholder together with exactly the given value.
 *
 * @param placeholder attribute placeholder to look for; must not be null, it is dereferenced for the comparison
 * @param value       expected attribute value, compared case-sensitively; must not be null
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches
 *         both the placeholder and the value, {@code false} otherwise
 */
@WhenCondition
public boolean fileAttributeByPlaceholderEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder,
                                                @Argument(ArgumentType.STRING) String value) {

    // anyMatch short-circuits on the first hit, like the former filter/findFirst/isPresent chain.
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue()));
}
/**
 * Checks whether any file attribute carries the given label together with exactly the given value.
 *
 * @param label attribute label to look for; must not be null, it is dereferenced for the comparison
 * @param value expected attribute value, compared case-sensitively; must not be null
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches
 *         both the label and the value, {@code false} otherwise
 */
@WhenCondition
public boolean fileAttributeByLabelEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String label,
                                          @Argument(ArgumentType.STRING) String value) {

    // anyMatch short-circuits on the first hit, like the former filter/findFirst/isPresent chain.
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue()));
}
/**
 * Checks whether any file attribute carries the given id and a value equal to the given one,
 * ignoring case differences in the value.
 *
 * @param id    attribute id to look for (compared case-sensitively); must not be null
 * @param value expected attribute value, compared via {@link String#equalsIgnoreCase(String)}; must not be null
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches
 *         both the id and the value, {@code false} otherwise
 */
@WhenCondition
public boolean fileAttributeByIdEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
                                                 @Argument(ArgumentType.STRING) String value) {

    // anyMatch short-circuits on the first hit, like the former filter/findFirst/isPresent chain.
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue()));
}
/**
 * Checks whether any file attribute carries the given placeholder and a value equal to the given one,
 * ignoring case differences in the value.
 *
 * @param placeholder attribute placeholder to look for (compared case-sensitively); must not be null
 * @param value       expected attribute value, compared via {@link String#equalsIgnoreCase(String)}; must not be null
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches
 *         both the placeholder and the value, {@code false} otherwise
 */
@WhenCondition
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder,
                                                          @Argument(ArgumentType.STRING) String value) {

    // anyMatch short-circuits on the first hit, like the former filter/findFirst/isPresent chain.
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue()));
}
/**
 * Checks whether any file attribute carries the given label and a value equal to the given one,
 * ignoring case differences in the value.
 *
 * @param label attribute label to look for (compared case-sensitively); must not be null
 * @param value expected attribute value, compared via {@link String#equalsIgnoreCase(String)}; must not be null
 * @return {@code true} if {@code fileAttributes} is non-null and at least one attribute matches
 *         both the label and the value, {@code false} otherwise
 */
@WhenCondition
public boolean fileAttributeByLabelEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String label,
                                                    @Argument(ArgumentType.STRING) String value) {

    // anyMatch short-circuits on the first hit, like the former filter/findFirst/isPresent chain.
    return fileAttributes != null && fileAttributes.stream()
            .anyMatch(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue()));
}
@WhenCondition
public boolean rowEquals(@Argument(ArgumentType.STRING) String headerName,
@Argument(ArgumentType.STRING) String value) {
@ -106,6 +144,7 @@ public class Section {
.equals(value);
}
@WhenCondition
public boolean hasTableHeader(@Argument(ArgumentType.STRING) String headerName) {
@ -113,18 +152,21 @@ public class Section {
return tabularData != null && tabularData.containsKey(cleanHeaderName);
}
/**
 * Tells whether this section holds at least one entity of the given type.
 *
 * @param type entity type to look for
 * @return {@code true} as soon as one entity's type equals {@code type}, {@code false} if none does
 */
@WhenCondition
public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {

    for (Entity candidate : entities) {
        if (candidate.getType().equals(type)) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether this section holds at least one image of the given type.
 *
 * @param type image type to look for
 * @return {@code true} as soon as one image's type equals {@code type}, {@code false} if none does
 */
@WhenCondition
public boolean matchesImageType(@Argument(ArgumentType.TYPE) String type) {

    // The stream stays lazy: image types are read one by one until the first match.
    return images.stream()
            .map(image -> image.getType())
            .anyMatch(imageType -> imageType.equals(type));
}
@WhenCondition
public boolean headlineContainsWord(@Argument(ArgumentType.STRING) String word) {
@ -133,16 +175,16 @@ public class Section {
@ThenAction
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REGEX) String pattern,
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group) {
expandByRegEx(type, pattern, patternCaseInsensitive, group, null);
}
@ThenAction
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REGEX) String pattern,
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.REGEX) String withoutPattern) {
@ -162,7 +204,7 @@ public class Section {
continue;
}
if(withoutPattern != null) {
if (withoutPattern != null) {
Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord());
if (matcherWithout.find()) {
continue;
@ -173,10 +215,10 @@ public class Section {
while (matcher.find()) {
String match = matcher.group(group);
if (StringUtils.isNotBlank(match)) {
expanded.addAll(findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity
.getRedactionReason(), entity.getLegalBasis()));
if (StringUtils.isNotBlank(match)) {
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis());
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
}
}
}
@ -185,6 +227,7 @@ public class Section {
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
@ThenAction
public void redactImage(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@ -201,9 +244,9 @@ public class Section {
});
}
@ThenAction
public void redact(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
public void redact(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
@ -220,6 +263,7 @@ public class Section {
});
}
@ThenAction
public void redactNotImage(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@ -234,9 +278,9 @@ public class Section {
});
}
@ThenAction
public void redactNot(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
public void redactNot(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason) {
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
@ -260,12 +304,14 @@ public class Section {
@ThenAction
public void redactNotAndReference(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.REFERENCE_TYPE) String referenceType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason) {
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason) {
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
Set<Entity> references = entities.stream().filter(entity -> entity.getType().equals(referenceType)).collect(Collectors.toSet());
Set<Entity> references = entities.stream()
.filter(entity -> entity.getType().equals(referenceType))
.collect(Collectors.toSet());
entities.forEach(entity -> {
if (entity.getType().equals(type) || hasRecommendationDictionary && entity.getType()
@ -279,7 +325,6 @@ public class Section {
}
@ThenAction
public void expandToHintAnnotationByRegEx(@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.STRING) String pattern,
@ -310,6 +355,7 @@ public class Section {
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
}
@ThenAction
public void addHintAnnotationByRegEx(@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@ -329,6 +375,7 @@ public class Section {
}
}
@ThenAction
public void redactIfPrecededBy(@Argument(ArgumentType.STRING) String prefix,
@Argument(ArgumentType.TYPE) String type,
@ -346,6 +393,7 @@ public class Section {
});
}
@ThenAction
public void addHintAnnotation(@Argument(ArgumentType.STRING) String value,
@Argument(ArgumentType.TYPE) String asType) {
@ -354,9 +402,9 @@ public class Section {
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
}
@ThenAction
public void addRedaction(@Argument(ArgumentType.STRING) String value,
@Argument(ArgumentType.TYPE) String asType,
public void addRedaction(@Argument(ArgumentType.STRING) String value, @Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
@ -365,9 +413,9 @@ public class Section {
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
}
@ThenAction
public void redactLineAfter(@Argument(ArgumentType.STRING) String start,
@Argument(ArgumentType.TYPE) String asType,
public void redactLineAfter(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
@Argument(ArgumentType.STRING) String reason,
@ -389,6 +437,7 @@ public class Section {
}
}
@ThenAction
public void recommendLineAfter(@Argument(ArgumentType.STRING) String start,
@Argument(ArgumentType.TYPE) String asType) {
@ -414,11 +463,11 @@ public class Section {
}
}
@ThenAction
public void redactByRegEx(@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
@ -436,6 +485,7 @@ public class Section {
}
}
@ThenAction
public void addRecommendationByRegEx(@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@ -454,6 +504,7 @@ public class Section {
}
}
@ThenAction
public void redactAndRecommendByRegEx(@Argument(ArgumentType.REGEX) String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@ -476,9 +527,9 @@ public class Section {
}
}
@ThenAction
public void redactBetween(@Argument(ArgumentType.STRING) String start,
@Argument(ArgumentType.STRING) String stop,
public void redactBetween(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.STRING) String stop,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
@ -502,6 +553,7 @@ public class Section {
}
}
@ThenAction
public void redactLinesBetween(@Argument(ArgumentType.STRING) String start,
@Argument(ArgumentType.STRING) String stop,
@ -536,6 +588,7 @@ public class Section {
}
}
@ThenAction
public void highlightCell(@Argument(ArgumentType.STRING) String cellHeader,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@ -544,10 +597,10 @@ public class Section {
annotateCell(cellHeader, ruleNumber, type, false, false, null, null);
}
@ThenAction
public void redactCell(@Argument(ArgumentType.STRING) String cellHeader,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.TYPE) String type,
@Argument(ArgumentType.BOOLEAN) boolean addAsRecommendations,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
@ -555,6 +608,7 @@ public class Section {
annotateCell(cellHeader, ruleNumber, type, true, addAsRecommendations, reason, legalBasis);
}
@ThenAction
public void redactNotCell(@Argument(ArgumentType.STRING) String cellHeader,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@ -641,6 +695,7 @@ public class Section {
}
}
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.METHOD)
public @interface WhenCondition {
@ -658,6 +713,7 @@ public class Section {
public @interface Argument {
/** Kind of value the annotated parameter carries; {@link ArgumentType#STRING} when not specified. */
ArgumentType value() default ArgumentType.STRING;
}
}

View File

@ -1,5 +1,17 @@
package com.iqser.red.service.redaction.v1.server.redaction.utils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
@ -10,10 +22,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Slf4j
@UtilityClass
@SuppressWarnings("PMD")
@ -37,8 +45,7 @@ public class EntitySearchUtils {
startIndex = inputString.indexOf(cleanValue, stopIndex);
stopIndex = startIndex + cleanValue.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
return true;
}
} while (startIndex > -1);
@ -66,8 +73,7 @@ public class EntitySearchUtils {
startIndex = inputString.indexOf(cleanValue, stopIndex);
stopIndex = startIndex + cleanValue.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, isDictionaryEntry, isDossierDictionary, engine));
}
} while (startIndex > -1);
@ -121,8 +127,7 @@ public class EntitySearchUtils {
for (Entity word : entities) {
for (Entity inner : entities) {
if (inner.getWord().length() < word.getWord()
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
.getSectionNumber() == inner.getSectionNumber()) {
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
wordsToRemove.add(inner);
}
}
@ -141,14 +146,14 @@ public class EntitySearchUtils {
if (entities.contains(found)) {
Optional<Entity> existingOptional = entities.stream().filter(entity -> entity.equals(found)).findFirst();
if(!existingOptional.isPresent()){
if (!existingOptional.isPresent()) {
return;
}
var existing = existingOptional.get();
if(existing.getType().equals(found.getType())){
if (existing.getType().equals(found.getType())) {
existing.getEngines().addAll(found.getEngines());
} else if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){
} else if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
entities.remove(found);
entities.add(found);
}
@ -165,12 +170,14 @@ public class EntitySearchUtils {
}
public void addOrAddEngine(Set<Entity> existing, Set<Entity> toBeAdded){
public void addOrAddEngine(Set<Entity> existing, Set<Entity> toBeAdded) {
for(Entity toAdd: toBeAdded){
for (Entity toAdd : toBeAdded) {
if (existing.contains(toAdd)) {
Optional<Entity> existingOptional = existing.stream().filter(entity -> entity.equals(toAdd)).findFirst();
if(!existingOptional.isPresent()){
Optional<Entity> existingOptional = existing.stream()
.filter(entity -> entity.equals(toAdd))
.findFirst();
if (!existingOptional.isPresent()) {
return;
}
var existingEntity = existingOptional.get();
@ -181,4 +188,20 @@ public class EntitySearchUtils {
}
}
/**
 * Selects those found entities that are disjoint from at least one existing entity.
 * Two entities count as disjoint when the end of one is strictly smaller than the start of the other.
 *
 * NOTE(review): a found entity is kept as soon as a single existing entity does not overlap it,
 * even if it overlaps others; consequently nothing is kept when there are no existing entities.
 * The accompanying unit tests rely on this - confirm it is the intended rule.
 *
 * @param existingEntities entities already known; may be null
 * @param foundEntities    newly found candidates; may be null
 * @return a new mutable set with the selected candidates; empty if either argument is null
 */
public Set<Entity> findNonOverlappingMatchEntities(Set<Entity> existingEntities, Set<Entity> foundEntities) {

    if (existingEntities == null || foundEntities == null) {
        return new HashSet<>();
    }

    return foundEntities.stream()
            .filter(candidate -> existingEntities.stream()
                    .anyMatch(known -> known.getEnd() < candidate.getStart() || candidate.getEnd() < known.getStart()))
            .collect(Collectors.toCollection(HashSet::new));
}
}

View File

@ -27,4 +27,164 @@ public class EntitySearchUtilsTest {
}
/*
 * Text:     Batman X. Superman Y.
 * Position: 0123456789
 *                     0123456789
 *                               0123456789
 */
@Test
public void testNotOverlappingEntitiesExpandedEnd() {

    // Given: both plain names are already known ...
    Set<Entity> known = new HashSet<>();
    known.add(new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE));
    known.add(new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE));

    // ... and each of them was expanded at its end by an initial.
    Entity batmanWithInitial = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
    Entity supermanWithInitial = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE);
    Set<Entity> candidates = new HashSet<>();
    candidates.add(batmanWithInitial);
    candidates.add(supermanWithInitial);

    // When
    Set<Entity> kept = EntitySearchUtils.findNonOverlappingMatchEntities(known, candidates);

    // Then: each expansion is disjoint from the other known name, so both are kept.
    assertThat(kept.size()).isEqualTo(2);
    assertThat(kept).contains(batmanWithInitial);
    assertThat(kept).contains(supermanWithInitial);
}
/*
 * Text:     Batman X. Superman Y.
 * Position: 0123456789
 *                     0123456789
 *                               0123456789
 */
@Test
public void testNotOverlappingEntitiesExpandedStartAndEndOverlapping() {

    // Given: "Batman X." (0-8) and "Superman" (10-17) are already known.
    Set<Entity> known = new HashSet<>();
    known.add(new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE));
    known.add(new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE));

    Entity batmanWithInitial = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
    Entity supermanExpandedBothSides = new Entity("X. Superman Y.", "fake type", 7, 20, "fake headline", 0, false, false, Engine.RULE);
    Set<Entity> candidates = new HashSet<>();
    candidates.add(batmanWithInitial);
    candidates.add(supermanExpandedBothSides);

    // When
    Set<Entity> kept = EntitySearchUtils.findNonOverlappingMatchEntities(known, candidates);

    // Then: "X. Superman Y." (7-20) touches both known entities and is dropped;
    // "Batman X." is disjoint from "Superman" and survives.
    assertThat(kept.size()).isEqualTo(1);
    assertThat(kept).contains(batmanWithInitial);
}
/*
 * Text:     Batman X. Superman Y.
 * Position: 0123456789
 *                     0123456789
 *                               0123456789
 */
@Test
public void testNotOverlappingEntitiesExpandedStartAndEnd() {

    // Given: "Batman X." (0-8) and "Superman" (10-17) are already known.
    Set<Entity> known = new HashSet<>();
    known.add(new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE));
    known.add(new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE));

    Entity batmanWithInitial = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
    Entity supermanExpandedAtStart = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE);
    Set<Entity> candidates = new HashSet<>();
    candidates.add(batmanWithInitial);
    candidates.add(supermanExpandedAtStart);

    // When
    Set<Entity> kept = EntitySearchUtils.findNonOverlappingMatchEntities(known, candidates);

    // Then: "X. Superman" (7-17) touches both known entities and is dropped;
    // "Batman X." is disjoint from "Superman" and survives.
    assertThat(kept.size()).isEqualTo(1);
    assertThat(kept).contains(batmanWithInitial);
}
/*
 * Text:     Batman X. Superman Y.
 * Position: 0123456789
 *                     0123456789
 *                               0123456789
 */
@Test
public void testNotOverlappingEntitiesExpandedExistingAndExpandedEnd() {

    // Given: "X. Superman" (7-17) and "Batman" (0-5) are already known.
    Set<Entity> known = new HashSet<>();
    known.add(new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE));
    known.add(new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE));

    Entity batmanWithInitial = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
    Entity plainSuperman = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
    Set<Entity> candidates = new HashSet<>();
    candidates.add(batmanWithInitial);
    candidates.add(plainSuperman);

    // When
    Set<Entity> kept = EntitySearchUtils.findNonOverlappingMatchEntities(known, candidates);

    // Then: "Batman X." (0-8) touches both known entities and is dropped;
    // "Superman" is disjoint from "Batman" and survives.
    assertThat(kept.size()).isEqualTo(1);
    assertThat(kept).contains(plainSuperman);
}
/*
 * Text:     Batman X. Superman Y.
 * Position: 0123456789
 *                     0123456789
 *                               0123456789
 */
@Test
public void testNotOverlappingEntitiesExpandedEndLong() {

    // Given: "X. Superman" (7-17) and "Batman" (0-5) are already known.
    Set<Entity> known = new HashSet<>();
    known.add(new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE));
    known.add(new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE));

    Entity batmanSpanningEverything = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE);
    Entity plainSuperman = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
    Set<Entity> candidates = new HashSet<>();
    candidates.add(batmanSpanningEverything);
    candidates.add(plainSuperman);

    // When
    Set<Entity> kept = EntitySearchUtils.findNonOverlappingMatchEntities(known, candidates);

    // Then: the long expansion (0-17) touches both known entities and is dropped;
    // "Superman" is disjoint from "Batman" and survives.
    assertThat(kept.size()).isEqualTo(1);
    assertThat(kept).contains(plainSuperman);
}
}

View File

@ -1,3 +1,6 @@
Foo
F. Bar
B. Foo
Johnson R |
Weissler M S and Butters C A
AD Hurt

View File

@ -19,8 +19,8 @@ rule "0: Expand CBI Authors with firstname initials"
when
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
then
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
end