From e0dd06c6bf64bccff41f425cd934ebbe934384cf Mon Sep 17 00:00:00 2001 From: deiflaender Date: Wed, 5 Oct 2022 11:32:33 +0200 Subject: [PATCH] RED-5295: Added redactWordPartByRegEx rule function --- .../v1/server/redaction/model/Section.java | 73 +++++++++++++++++-- 1 file changed, 65 insertions(+), 8 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index e075b378..ff0414e1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -9,6 +9,7 @@ import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -24,8 +25,11 @@ import com.iqser.red.service.redaction.v1.model.Engine; import com.iqser.red.service.redaction.v1.model.FileAttribute; import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.FindEntityDetails; +import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; import com.iqser.red.service.redaction.v1.server.redaction.utils.SearchImplementation; @@ -607,7 +611,7 @@ public class Section { String startValue = getFirstRexExMatch(searchText, startPattern, startPatternCaseInsensitive, startGroup); - if (startValue == null){ + if (startValue == null) { return; } @@ -942,13 +946,10 @@ public class Section { @Argument(ArgumentType.STRING) String reason, @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { - - - if (!headline.isBlank()) { String cleanHeadline = headline.replaceAll("\\n", " ").replaceAll(" ", " ").trim(); - if(searchText.contains(cleanHeadline)) { + if (searchText.contains(cleanHeadline)) { Set found = findEntities(cleanHeadline, type, false, true, ruleNumber, reason, legalBasis, Engine.RULE, false); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); } @@ -956,6 +957,64 @@ public class Section { } + @ThenAction + public void redactWordPartByRegEx(@Argument(ArgumentType.REGEX)String pattern, + @Argument(ArgumentType.BOOLEAN) boolean patternCaseInsensitive, + @Argument(ArgumentType.INTEGER) int group, + @Argument(ArgumentType.INTEGER) int redactGroup, + @Argument(ArgumentType.TYPE) String asType, + @Argument(ArgumentType.RULE_NUMBER) int ruleNumber, + @Argument(ArgumentType.STRING) String reason, + @Argument(ArgumentType.LEGAL_BASIS) String legalBasis) { + + Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); + + Matcher findMatcher = compiledPattern.matcher(searchText); + + while (findMatcher.find()) { + String findMatch = findMatcher.group(group); + if (StringUtils.isNotBlank(findMatch)) { + Set found = findEntities(findMatch.trim(), asType, false, true, ruleNumber, reason, legalBasis, Engine.RULE, false); + + for (Entity entity : found) { + + Matcher redactMatcher = compiledPattern.matcher(entity.getWord()); + + while (redactMatcher.find()) { + String redactMatch = redactMatcher.group(redactGroup); + int start = redactMatcher.start(redactGroup); + + int i = 0; + List newPositions = new ArrayList<>(); + TextPositionSequence newSeq = null; + + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + for (TextPositionSequence textPositionSequence : entityPositionSequence.getSequences()) { + for (RedTextPosition textPosition : textPositionSequence.getTextPositions()) { + if (i >= start && i < start + redactMatch.length()) { + newPositions.add(textPosition); + if (newSeq == null) { + newSeq = textPositionSequence; + } + } + i++; + } + } + } + + newSeq.setTextPositions(newPositions); + entity.setWord(redactMatch); + String plainId = IdBuilder.buildId(List.of(newSeq)); + entity.setPositionSequences(List.of(new EntityPositionSequence(plainId, List.of(newSeq), newSeq.getPage()))); + + EntitySearchUtils.addEntitiesWithHigherRank(entities, entity, dictionary); + } + } + } + } + } + + @ThenAction @SuppressWarnings("unused") public void redactSection(@Argument(ArgumentType.TYPE) String type, @@ -1009,9 +1068,6 @@ public class Section { } - - - private Set findEntities(String value, String asType, boolean caseInsensitive, boolean redacted, int ruleNumber, String reason, String legalBasis, Engine engine, boolean asRecommendation) { @@ -1150,6 +1206,7 @@ public class Section { } + private void redactByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType, int ruleNumber, String reason, String legalBasis, boolean redaction) {