Pull request #286: Bugfix/RED-2845 3.0
Merge in RED/redaction-service from bugfix/RED-2845_3.0 to master * commit '1c982a40422a16e9caa8d264e4221524c7e1c92b': RED-2845 Bugfix: Avoid the expansion if it would result in a redaction overlap and bugfix in RegExp in drools
This commit is contained in:
commit
cda8e51d9d
@ -1,26 +1,34 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.ArgumentType;
|
||||
import com.iqser.red.service.redaction.v1.model.Engine;
|
||||
import com.iqser.red.service.redaction.v1.model.FileAttribute;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
|
||||
|
||||
@Data
|
||||
@Slf4j
|
||||
@ -62,39 +70,69 @@ public class Section {
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByIdEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> id.equals(attribute.getId()) && value.equals(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByPlaceholderEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equals(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByLabelEquals(@Argument(ArgumentType.FILE_ATTRIBUTE) String label,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> label.equals(attribute.getLabel()) && value.equals(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByIdEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String id,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> id.equals(attribute.getId()) && value.equalsIgnoreCase(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByPlaceholderEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String placeholder,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> placeholder.equals(attribute.getPlaceholder()) && value.equalsIgnoreCase(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean fileAttributeByLabelEqualsIgnoreCase(@Argument(ArgumentType.FILE_ATTRIBUTE) String label,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
return fileAttributes != null && fileAttributes.stream().filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue())).findFirst().isPresent();
|
||||
|
||||
return fileAttributes != null && fileAttributes.stream()
|
||||
.filter(attribute -> label.equals(attribute.getLabel()) && value.equalsIgnoreCase(attribute.getValue()))
|
||||
.findFirst()
|
||||
.isPresent();
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean rowEquals(@Argument(ArgumentType.STRING) String headerName,
|
||||
@Argument(ArgumentType.STRING) String value) {
|
||||
@ -106,6 +144,7 @@ public class Section {
|
||||
.equals(value);
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean hasTableHeader(@Argument(ArgumentType.STRING) String headerName) {
|
||||
|
||||
@ -113,18 +152,21 @@ public class Section {
|
||||
return tabularData != null && tabularData.containsKey(cleanHeaderName);
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean matchesType(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
return entities.stream().anyMatch(entity -> entity.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean matchesImageType(@Argument(ArgumentType.TYPE) String type) {
|
||||
|
||||
return images.stream().anyMatch(image -> image.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
@WhenCondition
|
||||
public boolean headlineContainsWord(@Argument(ArgumentType.STRING) String word) {
|
||||
|
||||
@ -133,16 +175,16 @@ public class Section {
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.REGEX) String pattern,
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@Argument(ArgumentType.INTEGER) int group) {
|
||||
|
||||
expandByRegEx(type, pattern, patternCaseInsensitive, group, null);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.REGEX) String pattern,
|
||||
public void expandByRegEx(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.REGEX) String pattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@Argument(ArgumentType.INTEGER) int group,
|
||||
@Argument(ArgumentType.REGEX) String withoutPattern) {
|
||||
@ -162,7 +204,7 @@ public class Section {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(withoutPattern != null) {
|
||||
if (withoutPattern != null) {
|
||||
Matcher matcherWithout = compiledWithoutPattern.matcher(entity.getWord());
|
||||
if (matcherWithout.find()) {
|
||||
continue;
|
||||
@ -173,10 +215,10 @@ public class Section {
|
||||
|
||||
while (matcher.find()) {
|
||||
String match = matcher.group(group);
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
expanded.addAll(findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity
|
||||
.getRedactionReason(), entity.getLegalBasis()));
|
||||
|
||||
if (StringUtils.isNotBlank(match)) {
|
||||
Set<Entity> expandedEntities = findEntities(entity.getWord() + match, type, false, entity.isRedaction(), entity.getMatchedRule(), entity.getRedactionReason(), entity.getLegalBasis());
|
||||
expanded.addAll(EntitySearchUtils.findNonOverlappingMatchEntities(entities, expandedEntities));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -185,6 +227,7 @@ public class Section {
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactImage(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@ -201,9 +244,9 @@ public class Section {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redact(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
public void redact(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
|
||||
|
||||
@ -220,6 +263,7 @@ public class Section {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactNotImage(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@ -234,9 +278,9 @@ public class Section {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactNot(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
public void redactNot(@Argument(ArgumentType.TYPE) String type, @Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason) {
|
||||
|
||||
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
|
||||
@ -260,12 +304,14 @@ public class Section {
|
||||
@ThenAction
|
||||
public void redactNotAndReference(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.REFERENCE_TYPE) String referenceType,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason) {
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason) {
|
||||
|
||||
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
|
||||
|
||||
Set<Entity> references = entities.stream().filter(entity -> entity.getType().equals(referenceType)).collect(Collectors.toSet());
|
||||
Set<Entity> references = entities.stream()
|
||||
.filter(entity -> entity.getType().equals(referenceType))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
entities.forEach(entity -> {
|
||||
if (entity.getType().equals(type) || hasRecommendationDictionary && entity.getType()
|
||||
@ -279,7 +325,6 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void expandToHintAnnotationByRegEx(@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.STRING) String pattern,
|
||||
@ -310,6 +355,7 @@ public class Section {
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void addHintAnnotationByRegEx(@Argument(ArgumentType.REGEX) String pattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@ -329,6 +375,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactIfPrecededBy(@Argument(ArgumentType.STRING) String prefix,
|
||||
@Argument(ArgumentType.TYPE) String type,
|
||||
@ -346,6 +393,7 @@ public class Section {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void addHintAnnotation(@Argument(ArgumentType.STRING) String value,
|
||||
@Argument(ArgumentType.TYPE) String asType) {
|
||||
@ -354,9 +402,9 @@ public class Section {
|
||||
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void addRedaction(@Argument(ArgumentType.STRING) String value,
|
||||
@Argument(ArgumentType.TYPE) String asType,
|
||||
public void addRedaction(@Argument(ArgumentType.STRING) String value, @Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
|
||||
@ -365,9 +413,9 @@ public class Section {
|
||||
EntitySearchUtils.addEntitiesIgnoreRank(entities, found);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactLineAfter(@Argument(ArgumentType.STRING) String start,
|
||||
@Argument(ArgumentType.TYPE) String asType,
|
||||
public void redactLineAfter(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@ -389,6 +437,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void recommendLineAfter(@Argument(ArgumentType.STRING) String start,
|
||||
@Argument(ArgumentType.TYPE) String asType) {
|
||||
@ -414,11 +463,11 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactByRegEx(@Argument(ArgumentType.REGEX) String pattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@Argument(ArgumentType.INTEGER) int group,
|
||||
@Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.INTEGER) int group, @Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
|
||||
@ -436,6 +485,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void addRecommendationByRegEx(@Argument(ArgumentType.REGEX) String pattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@ -454,6 +504,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactAndRecommendByRegEx(@Argument(ArgumentType.REGEX) String pattern,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
|
||||
@ -476,9 +527,9 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactBetween(@Argument(ArgumentType.STRING) String start,
|
||||
@Argument(ArgumentType.STRING) String stop,
|
||||
public void redactBetween(@Argument(ArgumentType.STRING) String start, @Argument(ArgumentType.STRING) String stop,
|
||||
@Argument(ArgumentType.TYPE) String asType,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean redactEverywhere,
|
||||
@ -502,6 +553,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactLinesBetween(@Argument(ArgumentType.STRING) String start,
|
||||
@Argument(ArgumentType.STRING) String stop,
|
||||
@ -536,6 +588,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void highlightCell(@Argument(ArgumentType.STRING) String cellHeader,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@ -544,10 +597,10 @@ public class Section {
|
||||
annotateCell(cellHeader, ruleNumber, type, false, false, null, null);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactCell(@Argument(ArgumentType.STRING) String cellHeader,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber, @Argument(ArgumentType.TYPE) String type,
|
||||
@Argument(ArgumentType.BOOLEAN) boolean addAsRecommendations,
|
||||
@Argument(ArgumentType.STRING) String reason,
|
||||
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
|
||||
@ -555,6 +608,7 @@ public class Section {
|
||||
annotateCell(cellHeader, ruleNumber, type, true, addAsRecommendations, reason, legalBasis);
|
||||
}
|
||||
|
||||
|
||||
@ThenAction
|
||||
public void redactNotCell(@Argument(ArgumentType.STRING) String cellHeader,
|
||||
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
|
||||
@ -641,6 +695,7 @@ public class Section {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
@Target(ElementType.METHOD)
|
||||
public @interface WhenCondition {
|
||||
@ -658,6 +713,7 @@ public class Section {
|
||||
public @interface Argument {
|
||||
|
||||
ArgumentType value() default ArgumentType.STRING;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,17 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrementValue;
|
||||
@ -10,10 +22,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
@SuppressWarnings("PMD")
|
||||
@ -37,8 +45,7 @@ public class EntitySearchUtils {
|
||||
startIndex = inputString.indexOf(cleanValue, stopIndex);
|
||||
stopIndex = startIndex + cleanValue.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
return true;
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
@ -66,8 +73,7 @@ public class EntitySearchUtils {
|
||||
startIndex = inputString.indexOf(cleanValue, stopIndex);
|
||||
stopIndex = startIndex + cleanValue.length();
|
||||
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString
|
||||
.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString.charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) {
|
||||
found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, isDictionaryEntry, isDossierDictionary, engine));
|
||||
}
|
||||
} while (startIndex > -1);
|
||||
@ -121,8 +127,7 @@ public class EntitySearchUtils {
|
||||
for (Entity word : entities) {
|
||||
for (Entity inner : entities) {
|
||||
if (inner.getWord().length() < word.getWord()
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
|
||||
.getSectionNumber() == inner.getSectionNumber()) {
|
||||
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
|
||||
wordsToRemove.add(inner);
|
||||
}
|
||||
}
|
||||
@ -141,14 +146,14 @@ public class EntitySearchUtils {
|
||||
|
||||
if (entities.contains(found)) {
|
||||
Optional<Entity> existingOptional = entities.stream().filter(entity -> entity.equals(found)).findFirst();
|
||||
if(!existingOptional.isPresent()){
|
||||
if (!existingOptional.isPresent()) {
|
||||
return;
|
||||
}
|
||||
var existing = existingOptional.get();
|
||||
|
||||
if(existing.getType().equals(found.getType())){
|
||||
if (existing.getType().equals(found.getType())) {
|
||||
existing.getEngines().addAll(found.getEngines());
|
||||
} else if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())){
|
||||
} else if (dictionary.getDictionaryRank(existing.getType()) <= dictionary.getDictionaryRank(found.getType())) {
|
||||
entities.remove(found);
|
||||
entities.add(found);
|
||||
}
|
||||
@ -165,12 +170,14 @@ public class EntitySearchUtils {
|
||||
}
|
||||
|
||||
|
||||
public void addOrAddEngine(Set<Entity> existing, Set<Entity> toBeAdded){
|
||||
public void addOrAddEngine(Set<Entity> existing, Set<Entity> toBeAdded) {
|
||||
|
||||
for(Entity toAdd: toBeAdded){
|
||||
for (Entity toAdd : toBeAdded) {
|
||||
if (existing.contains(toAdd)) {
|
||||
Optional<Entity> existingOptional = existing.stream().filter(entity -> entity.equals(toAdd)).findFirst();
|
||||
if(!existingOptional.isPresent()){
|
||||
Optional<Entity> existingOptional = existing.stream()
|
||||
.filter(entity -> entity.equals(toAdd))
|
||||
.findFirst();
|
||||
if (!existingOptional.isPresent()) {
|
||||
return;
|
||||
}
|
||||
var existingEntity = existingOptional.get();
|
||||
@ -181,4 +188,20 @@ public class EntitySearchUtils {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> findNonOverlappingMatchEntities(Set<Entity> existingEntities, Set<Entity> foundEntities) {
|
||||
|
||||
Set<Entity> result = new HashSet<>();
|
||||
if (existingEntities != null && foundEntities != null) {
|
||||
for (Entity existingEntity : existingEntities) {
|
||||
for (Entity foundEntity : foundEntities) {
|
||||
if (existingEntity.getEnd() < foundEntity.getStart() || foundEntity.getEnd() < existingEntity.getStart()) {
|
||||
result.add(foundEntity);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -27,4 +27,164 @@ public class EntitySearchUtilsTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedEnd() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity foundEntities2 = new Entity("Superman Y.", "fake type", 10, 20, "fake headline", 0, false, false, Engine.RULE);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(2);
|
||||
assertThat(result).contains(foundEntities1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedStartAndEndOverlapping() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity foundEntities2 = new Entity("X. Superman Y.", "fake type", 7, 20, "fake headline", 0, false, false, Engine.RULE);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedStartAndEnd() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity existingEntity2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity foundEntities2 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedExistingAndExpandedEnd() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X.", "fake type", 0, 8, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Text: Batman X. Superman Y.
|
||||
* Position: 0123456789
|
||||
* 0123456789
|
||||
* 0123456789
|
||||
*/
|
||||
@Test
|
||||
public void testNotOverlappingEntitiesExpandedEndLong() {
|
||||
|
||||
// Arrange
|
||||
Set<Entity> existingEntities = new HashSet<>();
|
||||
Entity existingEntity1 = new Entity("X. Superman", "fake type", 7, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity existingEntity2 = new Entity("Batman", "fake type", 0, 5, "fake headline", 0, false, false, Engine.RULE);
|
||||
existingEntities.add(existingEntity1);
|
||||
existingEntities.add(existingEntity2);
|
||||
|
||||
Set<Entity> foundEntities = new HashSet<>();
|
||||
Entity foundEntities1 = new Entity("Batman X. Superman", "fake type", 0, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
Entity foundEntities2 = new Entity("Superman", "fake type", 10, 17, "fake headline", 0, false, false, Engine.RULE);
|
||||
foundEntities.add(foundEntities1);
|
||||
foundEntities.add(foundEntities2);
|
||||
|
||||
// Act
|
||||
Set<Entity> result = EntitySearchUtils.findNonOverlappingMatchEntities(existingEntities, foundEntities);
|
||||
|
||||
// Assert
|
||||
assertThat(result.size()).isEqualTo(1);
|
||||
assertThat(result).contains(foundEntities2);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,3 +1,6 @@
|
||||
Foo
|
||||
F. Bar
|
||||
B. Foo
|
||||
Johnson R |
|
||||
Weissler M S and Butters C A
|
||||
AD Hurt
|
||||
|
||||
@ -19,8 +19,8 @@ rule "0: Expand CBI Authors with firstname initials"
|
||||
when
|
||||
Section(matchesType("CBI_author") || matchesType("recommendation_CBI_author"))
|
||||
then
|
||||
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
|
||||
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[^\\s]+");
|
||||
section.expandByRegEx("CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
|
||||
section.expandByRegEx("recommendation_CBI_author", "(,? [A-Z]\\.?( ?[A-Z]\\.?)?( ?[A-Z]\\.?)?\\b\\.?)", false, 1, "[\\s]+");
|
||||
end
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user