Pull request #482: RED-5295: Added redactWordPartByRegEx rule function

Merge in RED/redaction-service from RED-5295 to master

* commit 'e0dd06c6bf64bccff41f425cd934ebbe934384cf':
  RED-5295: Added redactWordPartByRegEx rule function
This commit is contained in:
Dominique Eiflaender 2022-10-12 10:49:11 +02:00
commit 69540bcd5e

View File

@ -9,6 +9,7 @@ import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -24,8 +25,11 @@ import com.iqser.red.service.redaction.v1.model.Engine;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails;
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation;
@ -606,7 +610,7 @@ public class Section {
String startValue = getFirstRexExMatch(searchText, startPattern, startPatternCaseInsensitive, startGroup);
if (startValue == null){
if (startValue == null) {
return;
}
@ -941,13 +945,10 @@ public class Section {
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
if (!headline.isBlank()) {
String cleanHeadline = headline.replaceAll("\\n", " ").replaceAll(" ", " ").trim();
if(searchText.contains(cleanHeadline)) {
if (searchText.contains(cleanHeadline)) {
Set<Entity> found = findEntities(cleanHeadline, type, false, true, ruleNumber, reason, legalBasis, Engine.RULE, false);
EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary);
}
@ -955,6 +956,64 @@ public class Section {
}
/**
 * Finds matches of {@code pattern} in the section's search text, resolves capture group
 * {@code group} of each match to entities via {@code findEntities}, and then shrinks every
 * found entity to the part of its word captured by {@code redactGroup} before registering it.
 *
 * <p>The same compiled pattern is applied twice: first against {@code searchText} to locate
 * candidate values, then against each entity's word to locate the sub-range to keep.
 *
 * @param pattern                regular expression to search for
 * @param patternCaseInsensitive whether the pattern is compiled case-insensitively
 * @param group                  capture group whose text is looked up as an entity
 * @param redactGroup            capture group whose text is the part of the entity to keep
 * @param asType                 entity type passed on to {@code findEntities}
 * @param ruleNumber             rule number passed on to {@code findEntities}
 * @param reason                 reason passed on to {@code findEntities}
 * @param legalBasis             legal basis passed on to {@code findEntities}
 */
@ThenAction
public void redactWordPartByRegEx(@Argument(ArgumentType.REGEX)String pattern,
@Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive,
@Argument(ArgumentType.INTEGER) int group,
@Argument(ArgumentType.INTEGER) int redactGroup,
@Argument(ArgumentType.TYPE) String asType,
@Argument(ArgumentType.RULE_NUMBER) int ruleNumber,
@Argument(ArgumentType.STRING) String reason,
@Argument(ArgumentType.LEGAL_BASIS) String legalBasis) {
    Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
    Matcher findMatcher = compiledPattern.matcher(searchText);
    while (findMatcher.find()) {
        String findMatch = findMatcher.group(group);
        if (StringUtils.isNotBlank(findMatch)) {
            Set<Entity> found = findEntities(findMatch.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE, false);
            for (Entity entity : found) {
                Matcher redactMatcher = compiledPattern.matcher(entity.getWord());
                // NOTE(review): the entity is mutated inside this loop, so a second match is
                // evaluated against already-trimmed positions - confirm whether more than one
                // match per entity word is expected at all.
                while (redactMatcher.find()) {
                    String redactMatch = redactMatcher.group(redactGroup);
                    if (redactMatch == null) {
                        // Optional group did not participate in this match; nothing to keep.
                        continue;
                    }
                    int start = redactMatcher.start(redactGroup);
                    int end = start + redactMatch.length();
                    int i = 0;
                    List<RedTextPosition> newPositions = new ArrayList<>();
                    TextPositionSequence newSeq = null;
                    // Walk all text positions of the entity in order, counting them with i, and
                    // keep those whose running index lies inside [start, end).
                    // Presumably one text position corresponds to one character of the entity
                    // word - TODO confirm.
                    for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
                        for (TextPositionSequence textPositionSequence : entityPositionSequence.getSequences()) {
                            for (RedTextPosition textPosition : textPositionSequence.getTextPositions()) {
                                if (i >= start && i < end) {
                                    newPositions.add(textPosition);
                                    if (newSeq == null) {
                                        // The sequence holding the first kept position is reused
                                        // as carrier for all kept positions.
                                        newSeq = textPositionSequence;
                                    }
                                }
                                i++;
                            }
                        }
                    }
                    if (newSeq == null) {
                        // No position fell into the range (empty group match or offsets outside
                        // the entity's positions); skip instead of dereferencing null.
                        continue;
                    }
                    // NOTE(review): this modifies the existing sequence object in place; verify
                    // that it is not shared with other consumers.
                    newSeq.setTextPositions(newPositions);
                    entity.setWord(redactMatch);
                    String plainId = IdBuilder.buildId(List.of(newSeq));
                    entity.setPositionSequences(List.of(new EntityPositionSequence(plainId, List.of(newSeq), newSeq.getPage())));
                    EntitySearchUtils.addEntitiesWithHigherRank(entities, entity, dictionary);
                }
            }
        }
    }
}
@ThenAction
@SuppressWarnings("unused")
public void redactSection(@Argument(ArgumentType.TYPE) String type,
@ -1008,9 +1067,6 @@ public class Section {
}
private Set<Entity> findEntities(String value, String asType, boolean caseInsensitive, boolean redacted,
int ruleNumber, String reason, String legalBasis, Engine engine,
boolean asRecommendation) {
@ -1149,6 +1205,7 @@ public class Section {
}
private void redactByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType, int ruleNumber,
String reason, String legalBasis, boolean redaction) {