From de725a630c06fbc6c6bef6229fcf3ba5c859ef73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Wed, 6 Jan 2021 14:41:31 +0100 Subject: [PATCH] RED-727: Added possibility to redact/addRecommendations by regEx in rules. Added email regEx and et al. author recommendation regEx --- .../v1/server/redaction/model/Section.java | 170 +++++----- .../service/EntityRedactionService.java | 59 ++-- ...sitionUtil.java => EntitySearchUtils.java} | 41 ++- .../v1/server/redaction/utils/Patterns.java | 20 +- .../v1/server/RedactionIntegrationTest.java | 2 +- .../service/EntityRedactionServiceTest.java | 30 +- .../redaction/utils/RegExPatternTest.java | 95 ++++++ .../src/test/resources/drools/rules.drl | 290 ++++++++++-------- 8 files changed, 424 insertions(+), 283 deletions(-) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/{PositionUtil.java => EntitySearchUtils.java} (65%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/RegExPatternTest.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 0d6d497b..b8f0e152 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -14,8 +14,8 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; 
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; -import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil; import lombok.Builder; import lombok.Data; @@ -76,10 +76,10 @@ public class Section { public void redact(String type, int ruleNumber, String reason, String legalBasis) { - boolean hasRecommendactionDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); + boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); entities.forEach(entity -> { - if (entity.getType().equals(type) || hasRecommendactionDictionary && entity.getType() + if (entity.getType().equals(type) || hasRecommendationDictionary && entity.getType() .equals(RECOMMENDATION_PREFIX + type)) { entity.setRedaction(true); entity.setMatchedRule(ruleNumber); @@ -92,10 +92,10 @@ public class Section { public void redactNot(String type, int ruleNumber, String reason) { - boolean hasRecommendactionDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); + boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); entities.forEach(entity -> { - if (entity.getType().equals(type) || hasRecommendactionDictionary && entity.getType() + if (entity.getType().equals(type) || hasRecommendationDictionary && entity.getType() .equals(RECOMMENDATION_PREFIX + type)) { entity.setRedaction(false); entity.setMatchedRule(ruleNumber); @@ -120,8 +120,8 @@ public class Section { public void addHintAnnotation(String value, String asType) { - Set found = findEntities(value.trim(), asType, true); - entities.addAll(found); + Set found = findEntities(value.trim(), asType, true, false, 0, null, null); + addNewerToEntities(found); } @@ -133,24 +133,41 @@ public class Section { if (values != null) { for (String value : values) { if (StringUtils.isNotBlank(value)) { - Set found = findEntities(value.trim(), asType, false); - // HashSet keeps the older value, but we want the new only. 
- entities.removeAll(found); - entities.addAll(found); + Set found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis); + addNewerToEntities(found); } } } + } - // TODO No need to iterate - entities.forEach(entity -> { - if (entity.getType().equals(asType)) { - entity.setRedaction(true); - entity.setMatchedRule(ruleNumber); - entity.setRedactionReason(reason); - entity.setLegalBasis(legalBasis); + + public void redactByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType, int ruleNumber, String reason, String legalBasis) { + Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); + + Matcher matcher = compiledPattern.matcher(text); + + while (matcher.find()) { + String match = matcher.group(group); + if (StringUtils.isNotBlank(match)) { + Set found = findEntities(match.trim(), asType, false, true, ruleNumber, reason, legalBasis); + addNewerToEntities(found); } - }); + } + } + + public void addRecommendationByRegEx(String pattern, boolean patternCaseInsensitive, int group, String asType) { + Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); + + Matcher matcher = compiledPattern.matcher(text); + + while (matcher.find()) { + String match = matcher.group(group); + if (StringUtils.isNotBlank(match) && match.length() >= 3) { + localDictionaryAdds.computeIfAbsent(RECOMMENDATION_PREFIX + asType, (x) -> new HashSet<>()) + .add(match); + } + } } @@ -162,30 +179,21 @@ public class Section { if (values != null) { for (String value : values) { if (StringUtils.isNotBlank(value)) { - Set found = findEntities(value.trim(), asType, false); - // HashSet keeps the older value, but we want the new only. 
- entities.removeAll(found); - entities.addAll(found); + + Set found = findEntities(value.trim(), asType, false, true, ruleNumber, reason, legalBasis); + addNewerToEntities(found); + if (redactEverywhere && !isLocal()) { localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(value.trim()); } } } } - - // TODO No need to iterate - entities.forEach(entity -> { - if (entity.getType().equals(asType)) { - entity.setRedaction(true); - entity.setMatchedRule(ruleNumber); - entity.setRedactionReason(reason); - entity.setLegalBasis(legalBasis); - } - }); } - public void redactLinesBetween(String start, String stop, String asType, int ruleNumber, boolean redactEverywhere, + public void redactLinesBetween(String start, String stop, String asType, int ruleNumber, + boolean redactEverywhere, String reason, String legalBasis) { String[] values = StringUtils.substringsBetween(text, start, stop); @@ -201,11 +209,9 @@ public class Section { return; } - Set found = findEntities(line.trim(), asType, false); + Set found = findEntities(line.trim(), asType, false, true, ruleNumber, reason, legalBasis); + addNewerToEntities(found); - // HashSet keeps the older value, but we want the new only. - entities.removeAll(found); - entities.addAll(found); if (redactEverywhere && !isLocal()) { localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(line.trim()); } @@ -213,49 +219,6 @@ public class Section { } } } - - // TODO No need to iterate - entities.forEach(entity -> { - if (entity.getType().equals(asType)) { - entity.setRedaction(true); - entity.setMatchedRule(ruleNumber); - entity.setRedactionReason(reason); - entity.setLegalBasis(legalBasis); - } - }); - } - - - private Set findEntities(String value, String asType, boolean caseinsensitive) { - - if (value.trim().length() <= 2) { - return new HashSet<>(); - } - - Set found = new HashSet<>(); - - String text = caseinsensitive ? searchText.toLowerCase() : searchText; - String searchValue = caseinsensitive ? 
value.toLowerCase() : value; - - int startIndex; - int stopIndex = 0; - do { - startIndex = text.indexOf(searchValue, stopIndex); - stopIndex = startIndex + searchValue.length(); - - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(text.charAt(startIndex - 1)) || isSeparator(text - .charAt(startIndex - 1))) && (stopIndex == text.length() || isSeparator(text.charAt(stopIndex)))) { - found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline, sectionNumber, false)); - } - } while (startIndex > -1); - - return PositionUtil.clearAndFindPositions(found, searchableText, dictionary); - } - - - private boolean isSeparator(char c) { - - return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; } @@ -265,7 +228,8 @@ public class Section { } - public void redactCell(String cellHeader, int ruleNumber, String type, boolean addAsRecommendations, String reason, + public void redactCell(String cellHeader, int ruleNumber, String type, boolean addAsRecommendations, String + reason, String legalBasis) { annotateCell(cellHeader, ruleNumber, type, true, addAsRecommendations, reason, legalBasis); @@ -279,6 +243,27 @@ public class Section { } + private Set findEntities(String value, String asType, boolean caseInsensitive, boolean redacted, + int ruleNumber, String reason, String legalBasis) { + + String text = caseInsensitive ? searchText.toLowerCase() : searchText; + String searchValue = caseInsensitive ? 
value.toLowerCase() : value; + + Set found = EntitySearchUtils.find(text, Set.of(searchValue), asType, headline, sectionNumber, true); + + found.forEach(entity -> { + if (redacted) { + entity.setRedaction(true); + entity.setMatchedRule(ruleNumber); + entity.setRedactionReason(reason); + entity.setLegalBasis(legalBasis); + } + }); + + return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary); + } + + private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, boolean addAsRecommendations, String reason, String legalBasis) { @@ -303,13 +288,11 @@ public class Section { Set singleEntitySet = new HashSet<>(); singleEntitySet.add(entity); - PositionUtil.clearAndFindPositions(singleEntitySet, searchableText, dictionary); + EntitySearchUtils.clearAndFindPositions(singleEntitySet, searchableText, dictionary); - // HashSet keeps the older value, but we want the new only. - entities.remove(entity); - entities.add(entity); + addNewerToEntities(entity); - PositionUtil.removeEntitiesContainedInLarger(entities); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); if (addAsRecommendations && !isLocal()) { String cleanedWord = word.replaceAll(",", " ").replaceAll(" ", " ").trim() + " "; @@ -330,6 +313,19 @@ public class Section { } } + + private void addNewerToEntities(Set found) { + // HashSet keeps the older value, but we want the new only. + entities.removeAll(found); + entities.addAll(found); + } + + private void addNewerToEntities(Entity found) { + // HashSet keeps the older value, but we want the new only. 
+ entities.remove(found); + entities.add(found); + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 872075f8..371fbb6e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -1,5 +1,21 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.kie.api.runtime.KieContainer; +import org.springframework.stereotype.Service; + import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; import com.iqser.red.service.redaction.v1.model.ManualRedactions; import com.iqser.red.service.redaction.v1.model.Rectangle; @@ -14,26 +30,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionS import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.Section; import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; -import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil; +import 
com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.StringUtils; -import org.kie.api.runtime.KieContainer; -import org.springframework.stereotype.Service; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.util.stream.Stream; @Slf4j @Service @@ -70,7 +72,7 @@ public class EntityRedactionService { documentEntities.removeAll(foundByLocal); documentEntities.addAll(foundByLocal); - PositionUtil.removeEntitiesContainedInLarger(documentEntities); + EntitySearchUtils.removeEntitiesContainedInLarger(documentEntities); } for (Entity entity : documentEntities) { @@ -96,8 +98,9 @@ public class EntityRedactionService { } - private Set findEntities(Document classifiedDoc, KieContainer kieContainer, ManualRedactions manualRedactions, Dictionary dictionary, - boolean local, Map> hintsPerSectionNumber) { + private Set findEntities(Document classifiedDoc, KieContainer kieContainer, + ManualRedactions manualRedactions, Dictionary dictionary, boolean local, + Map> hintsPerSectionNumber) { Set documentEntities = new HashSet<>(); @@ -119,7 +122,7 @@ public class EntityRedactionService { } sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { - Section analysedRowSection = droolsExecutionService.executeRules(kieContainer,sectionSearchableTextPair.getSection()); + Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); 
documentEntities.addAll(analysedRowSection.getEntities()); analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> { @@ -152,7 +155,8 @@ public class EntityRedactionService { private List processTablePerRow(Table table, ManualRedactions manualRedactions, - AtomicInteger sectionNumber, Dictionary dictionary, boolean local, + AtomicInteger sectionNumber, Dictionary dictionary, + boolean local, Map> hintsPerSectionNumber) { List sectionSearchableTextPairs = new ArrayList<>(); @@ -233,8 +237,8 @@ public class EntityRedactionService { sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() .isLocal(local) .dictionaryTypes(dictionary.getTypes()) - .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream.concat(rowEntities - .stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream()) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream + .concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream()) .collect(Collectors.toSet()) : rowEntities) .text(entireTableText.getAsStringWithLinebreaks()) .searchText(entireTableText.toString()) @@ -253,15 +257,16 @@ public class EntityRedactionService { Map> hintsPerSectionNumber) { SearchableText searchableText = paragraph.getSearchableText(); - addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber.intValue()); + addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber + .intValue()); Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber.intValue(), dictionary, local); surroundingWordsService.addSurroundingText(entities, searchableText, dictionary); return new SectionSearchableTextPair(Section.builder() .isLocal(local) .dictionaryTypes(dictionary.getTypes()) - .entities(hintsPerSectionNumber != null && 
hintsPerSectionNumber.containsKey(sectionNumber) ? Stream.concat(entities - .stream(), hintsPerSectionNumber.get(sectionNumber).stream()) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream + .concat(entities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream()) .collect(Collectors.toSet()) : entities) .text(searchableText.getAsStringWithLinebreaks()) .searchText(searchableText.toString()) @@ -291,7 +296,7 @@ public class EntityRedactionService { } } - return PositionUtil.clearAndFindPositions(found, searchableText, dictionary); + return EntitySearchUtils.clearAndFindPositions(found, searchableText, dictionary); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java similarity index 65% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index 9f692592..5206eecc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/PositionUtil.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -3,9 +3,11 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; import java.util.stream.Collectors; import 
com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; @@ -18,7 +20,40 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @UtilityClass -public class PositionUtil { +public class EntitySearchUtils { + + public Set find(String inputString, Set values, String type, String headline, int sectionNumber, + boolean local) { + + Set found = new HashSet<>(); + + for (String value : values) { + + if (value.trim().length() <= 2) { + continue; + } + + int startIndex; + int stopIndex = 0; + do { + startIndex = inputString.indexOf(value, stopIndex); + stopIndex = startIndex + value.length(); + + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString + .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { + found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber, !local)); + } + } while (startIndex > -1); + } + return found; + } + + + private boolean isSeparator(char c) { + + return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; + } + public Set clearAndFindPositions(Set entities, SearchableText text, Dictionary dictionary) { @@ -41,7 +76,7 @@ public class PositionUtil { for (int i = 0; i <= orderedEntities.size() - 1; i++) { try { orderedEntities.get(i).setPositionSequences(List.of(positionSequences.get(i))); - } catch (Exception e){ + } catch (Exception e) { log.warn("Mismatch between EntityPositionSequence and found Entity!"); } } @@ -67,6 +102,4 @@ public class PositionUtil { } entities.removeAll(wordsToRemove); } - - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/Patterns.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/Patterns.java index 
3e20e767..76a6e52f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/Patterns.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/Patterns.java @@ -1,12 +1,28 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; -import java.util.regex.Pattern; - import lombok.experimental.UtilityClass; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + @UtilityClass public class Patterns { + public static Map patternCache = new HashMap<>(); + public static Pattern AUTHOR_TABLE_SPITTER = Pattern.compile("((((di)|(van)) )|[A-Z]’)?[A-ZÄÖÜ][\\wäöüéèê]{2,}( ?[A-ZÄÖÜ]{1,2}\\.)+|((((di)|(van)) )|[A-Z]’)?[A-ZÄÖÜ][\\wäöüéèê]{2,}( ?[A-ZÄÖÜ]{1,2} )+"); + + public Pattern getCompiledPattern(String pattern, boolean caseInsensitive) { + + String patternKey = pattern + caseInsensitive; + if (patternCache.containsKey(patternKey)) { + return patternCache.get(patternKey); + } + Pattern compiledPattern = Pattern.compile(pattern, caseInsensitive ? 
Pattern.CASE_INSENSITIVE : 0); + patternCache.put(patternKey, compiledPattern); + return compiledPattern; + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index e5328306..80cb3b14 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -390,7 +390,7 @@ public class RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf"); RedactionRequest request = RedactionRequest.builder() .ruleSetId(TEST_RULESET_ID) diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 3a3544e4..41b4ccd1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -10,7 +10,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document; import 
com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; -import com.iqser.red.service.redaction.v1.server.redaction.utils.PositionUtil; +import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import org.apache.commons.io.IOUtils; @@ -110,7 +110,7 @@ public class EntityRedactionServiceTest { Entity nesting = new Entity("nesting nested", "fake type", 2, 16, "fake headline", 0, false); entities.add(nested); entities.add(nesting); - PositionUtil.removeEntitiesContainedInLarger(entities); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); assertThat(entities.size()).isEqualTo(1); assertThat(entities).contains(nesting); @@ -313,7 +313,7 @@ public class EntityRedactionServiceTest { assertThat(classifiedDoc.getEntities()).hasSize(1); // one page assertThat(classifiedDoc.getEntities().get(1).stream() .filter(entity -> entity.getMatchedRule() == 6) - .count()).isEqualTo(18); + .count()).isEqualTo(13); } } @@ -515,28 +515,4 @@ public class EntityRedactionServiceTest { } } - - @Test - public void testAuthorSplitting() { - - String word = "Porch JR, " + "Kendall TZ, " + "Krueger HO"; - - word.replaceAll(",", " ").replaceAll(" ", " "); - - Pattern pattern = Pattern.compile("[A-ZÄÖÜ][\\wäöüéèê]{2,}( [A-ZÄÖÜ]{1,2}\\.)+"); - Matcher matcher = pattern.matcher(word); - - List allMatches = new ArrayList<>(); - while (matcher.find()) { - allMatches.add(matcher.group()); - } - - for (String name : allMatches) { - if (name.length() >= 3) { - System.out.println(name); -// dictionaryService.addToLocalDictionary(type, name); - } - } - } - } \ No newline at end of file diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/RegExPatternTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/RegExPatternTest.java new file mode 100644 index 00000000..1f2ae55b --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/RegExPatternTest.java @@ -0,0 +1,95 @@ +package com.iqser.red.service.redaction.v1.server.redaction.utils; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class RegExPatternTest { + + + @Test + public void testEmailRegEx(){ + String text = "Address: Schwarzwaldalle " + + "P.O.Box\n" + + "CH-4002 Basel\n" + + "Switzerland\n" + + "Contact: Christian Warmers\n" + + "Tel: +41 (61) 323 8044\n" + + "christian.warmers@syngenta.com"; + + + Pattern p = Pattern.compile("\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\\b", Pattern.CASE_INSENSITIVE); + + Matcher matcher = p.matcher(text); + + while (matcher.find()) { + String match = matcher.group(0); + System.out.println(match); + } + } + + + @Test + public void testEtAlRegEx() { + String text = "To assess the potential of S-metolachlor to cause endocrine disruption (ED) a review (Charlton 2014,\n" + + "ASB2016-762) was submitted that summarises results from regulatory and open scientific literature\n" + + "studies covering in vitro and in vivo studies (level 2-5 of the OECD Conceptual Framework). According to this information metolachlor increased (1.5-fold) aromatase activity in JEG-3 cells (Laville et al.\n" + + "2006, ASB2010-14391) and induced weak anti-androgenic activity in the MDA-kb2 reporter cell line\n" + + "with a IC50 of 9.92 µM (IC50 of positive control flutamide: 0.51 µM) (Aït-Aïssa et al. 2010, ASB2015-\n" + + "9562). 
Data from the Tox21 high throughput screening revealed just few postive findings in assays to\n" + + "identify antagonists of the androgen receptor. An isolated result of this screening showed agonistic\n" + + "activity on the thyroid stimulating hormone receptor, while Dalton et al. (2003, ASB2018-2832)\n" + + "demonstrated that metolachlor induced CYP2B1/2 and CYP3A1/2 but did not affect T4, T3 or TSH.\n" + + "After prepubertal exposure of male Wistar rats to metolachlor (Mathias et al. 2012, ASB2016-9890) a\n" + + "statistically significant increase of serum hormone concentration was observed for testosterone (at the\n" + + "dose 50 mg/kg) as well as a statistically significant decrease in the age of preputial separation at a dose\n" + + "of 5 and 50 mg/kg. Furthermore a statistically significant increase for estradiol at a dose of 50 mg/kg\n" + + "and for FSH at a dose of 5 and 50 mg/kg and morphological alterations of the seminiferous epithelium\n" + + "were observed. Relative testicular weight was not altered. A statistically significant increase of relative\n" + + "weights was observed in long-term studies with rats (Tisdel et al. 1983, TOX9800328 ). This finding\n" + + "was attributed to lower terminal body weight. In mice a statistically significant decrease of the weight\n" + + "seminal vesicle (Tisdel et al. 1982, TOX9800327) was shown after 24 month treatment with\n" + + "metolachlor. In a mouse preimplantation embryo assay from open literature metolachlor increased the\n" + + "percentage of apoptosis significantly and reduced the mean number of cells per embryo significantly\n" + + "while the percentage of developing blastocytes was unaltered (Grennlee et al. 2004, ASB2016-9889).\n" + + "In reproduvtive toxicity studies a retarded body weight development of the pups was observed, while\n" + + "survival and normal morphological and functional development were not altered. 
No adverse effects\n" + + "on male fertility were seen, however important parameters to assess effects on female fertility like\n" + + "cyclicity, ovarian follicles as well as developmental landmarks in the offspring have not been investigated."; + + Pattern p = Pattern.compile("([^\\s(]*?( \\w\\.?)?) et al\\.?"); + + Matcher matcher = p.matcher(text); + + while (matcher.find()) { + String match = matcher.group(1); + System.out.println(match); + } + } + + + @Test + public void testAuthorSplitting(){ + + String word = "Porch JR, " + "Kendall TZ, " + "Krueger HO"; + + word.replaceAll(",", " ").replaceAll(" ", " "); + + Pattern pattern = Pattern.compile("[A-ZÄÖÜ][\\wäöüéèê]{2,}( [A-ZÄÖÜ]{1,2}\\.)+"); + Matcher matcher = pattern.matcher(word); + + List allMatches = new ArrayList<>(); + while (matcher.find()) { + allMatches.add(matcher.group()); + } + + for(String name: allMatches) { + if(name.length() >= 3) { + System.out.println(name); + } + } + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 3684402e..8f6cfbdd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -8,48 +8,48 @@ global Section section // --------------------------------------- CBI rules ------------------------------------------------------------------- rule "1: Redacted because Section contains Vertebrate" - when - Section(matchesType("vertebrate")) - then - section.redact("CBI_author", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("CBI_address", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - end + when + Section(matchesType("vertebrate")) + then + section.redact("CBI_author", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); + section.redact("CBI_address", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + end rule "2: Not Redacted because Section contains no Vertebrate" - when - Section(!matchesType("vertebrate")) - then - section.redactNot("CBI_author", 2, "No Vertebrate found"); - section.redactNot("CBI_address", 2, "No Vertebrate found"); - end + when + Section(!matchesType("vertebrate")) + then + section.redactNot("CBI_author", 2, "No Vertebrate found"); + section.redactNot("CBI_address", 2, "No Vertebrate found"); + end rule "3: Do not redact Names and Addresses if no redaction Indicator is contained" - when - Section(matchesType("vertebrate"), matchesType("no_redaction_indicator")) - then - section.redactNot("CBI_author", 3, "Vertebrate and No Redaction Indicator found"); - section.redactNot("CBI_address", 3, "Vertebrate and No Redaction Indicator found"); - end + when + Section(matchesType("vertebrate"), matchesType("no_redaction_indicator")) + then + section.redactNot("CBI_author", 3, "Vertebrate and No Redaction Indicator found"); + section.redactNot("CBI_address", 3, "Vertebrate and No Redaction Indicator found"); + end rule "4: Do not redact Names and Addresses if no redaction Indicator is contained" - when - Section(matchesType("vertebrate"), matchesType("published_information")) - then - section.redactNot("CBI_author", 4, "Vertebrate and Published Information found"); - section.redactNot("CBI_address", 4, "Vertebrate and Published Information found"); - end + when + Section(matchesType("vertebrate"), matchesType("published_information")) + then + section.redactNot("CBI_author", 4, "Vertebrate and Published Information found"); + section.redactNot("CBI_address", 4, "Vertebrate and Published Information found"); + end rule "5: Redact Names and Addresses if no_redaction_indicator and redaction_indicator is contained" - when - Section(matchesType("vertebrate"), matchesType("no_redaction_indicator"), matchesType("redaction_indicator")) - then - 
section.redact("CBI_author", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("CBI_address", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - end + when + Section(matchesType("vertebrate"), matchesType("no_redaction_indicator"), matchesType("redaction_indicator")) + then + section.redact("CBI_author", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + end rule "6: Not redacted because Vertebrate Study = N" @@ -64,12 +64,12 @@ rule "6: Not redacted because Vertebrate Study = N" rule "7: Redact if must redact entry is found" - when - Section(matchesType("must_redact")) - then - section.redact("CBI_author", 7, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("CBI_address", 7, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - end + when + Section(matchesType("must_redact")) + then + section.redact("CBI_author", 7, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 7, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); + end rule "8: Redact Authors and Addresses in Reference Table if it is a Vertebrate study" @@ -92,55 +92,75 @@ rule "9: Redact sponsor company" rule "10: Redact determination of residues" - when - Section(searchText.toLowerCase.contains("determination of residues") && ( - searchText.toLowerCase.contains("livestock") || - searchText.toLowerCase.contains("live stock") || - searchText.toLowerCase.contains("tissue") || - searchText.toLowerCase.contains("liver") || - searchText.toLowerCase.contains("muscle") || - searchText.toLowerCase.contains("bovine") || - searchText.toLowerCase.contains("ruminant") || - searchText.toLowerCase.contains("ruminants") - )) - then - section.redact("CBI_author", 10, "Determination of residues was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("CBI_address", 10, "Determination of residues was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.addHintAnnotation("determination of residues", "must_redact"); - section.addHintAnnotation("livestock", "must_redact"); - section.addHintAnnotation("live stock", "must_redact"); - section.addHintAnnotation("tissue", "must_redact"); - section.addHintAnnotation("liver", "must_redact"); - section.addHintAnnotation("muscle", "must_redact"); - section.addHintAnnotation("bovine", "must_redact"); - section.addHintAnnotation("ruminant", "must_redact"); - section.addHintAnnotation("ruminants", "must_redact"); - end + when + Section(( + searchText.toLowerCase.contains("determination of residues") || + searchText.toLowerCase.contains("determination of total residues") + ) && ( + searchText.toLowerCase.contains("livestock") || + searchText.toLowerCase.contains("live stock") || + searchText.toLowerCase.contains("tissue") || + searchText.toLowerCase.contains("tissues") || + searchText.toLowerCase.contains("liver") || + searchText.toLowerCase.contains("muscle") || + searchText.toLowerCase.contains("bovine") || + searchText.toLowerCase.contains("ruminant") || + 
searchText.toLowerCase.contains("ruminants") + )) + then + section.redact("CBI_author", 10, "Determination of residues was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 10, "Determination of residues was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.addHintAnnotation("determination of residues", "must_redact"); section.addHintAnnotation("determination of total residues", "must_redact"); /* every phrase accepted by the rule condition gets its hint annotation */ + section.addHintAnnotation("livestock", "must_redact"); + section.addHintAnnotation("live stock", "must_redact"); + section.addHintAnnotation("tissue", "must_redact"); + section.addHintAnnotation("tissues", "must_redact"); + section.addHintAnnotation("liver", "must_redact"); + section.addHintAnnotation("muscle", "must_redact"); + section.addHintAnnotation("bovine", "must_redact"); + section.addHintAnnotation("ruminant", "must_redact"); + section.addHintAnnotation("ruminants", "must_redact"); + end rule "11: Redact if CTL/* or BL/* was found" when Section(searchText.contains("CTL/") || searchText.contains("BL/")) then - section.redact("CBI_author", 11, "Laboraty for vertebrate studies found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("CBI_address", 11, "Laboraty for vertebrate studies found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.addHintAnnotation("CTL", "must_redact"); - section.addHintAnnotation("BL", "must_redact"); + section.redact("CBI_author", 11, "Laboratory for vertebrate studies found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 11, "Laboratory for vertebrate studies found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.addHintAnnotation("CTL", "must_redact"); + section.addHintAnnotation("BL", "must_redact"); end +rule "12: Add recommendation for et al. author" + when + Section(searchText.contains("et al.")) + then + section.addRecommendationByRegEx("([^\\s(]*?( \\w\\.?)?) 
et al\\.?", false, 1, "CBI_author"); + end + // --------------------------------------- PII rules ------------------------------------------------------------------- -rule "12: Redacted PII Personal Identification Information" - when - Section(matchesType("PII")) - then - section.redact("PII", 12, "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - end +rule "13: Redacted PII Personal Identification Information" + when + Section(matchesType("PII")) + then + section.redact("PII", 13, "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end -rule "13: Redact contact information" +rule "14: Redact Emails by RegEx" + when + Section(searchText.contains("@")) + then + section.redactByRegEx("\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,63}\\b", true, 0, "PII", 14, "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); /* TLD length follows the DNS label limit (63) so addresses under long TLDs such as .museum are redacted as well. */ + end + + +rule "15: Redact contact information" when Section(text.contains("Contact point:") || text.contains("Phone:") @@ -158,96 +178,96 @@ rule "13: Redact contact information" || text.contains("Telephone:") || text.contains("European contact:")) then - section.redactLineAfter("Contact point:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Phone:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel.:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); - section.redactLineAfter("Email:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("e-mail:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail address:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Contact:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Alternative contact:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone number:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone No:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax number:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("No:", "Fax", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("Contact:", "Tel.:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("European contact:", "PII", 13, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact point:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); + section.redactLineAfter("Tel.:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Email:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("e-mail:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail address:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Alternative contact:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone No:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("Contact:", "Tel.:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("European contact:", "PII", 15, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); end -rule "14: Redact contact information if applicant is found" - when - Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:")) - then - section.redactLineAfter("Contact point:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Phone:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel.:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Email:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("e-mail:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail address:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Contact:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Alternative contact:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone number:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); - section.redactLineAfter("Telephone No:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax number:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("No:", "Fax", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("Contact:", "Tel.:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("European contact:", "PII", 14, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - end +rule "16: Redact contact information if applicant is found" + when + Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Telephone number:")) + then + section.redactLineAfter("Contact point:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel.:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Email:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); + section.redactLineAfter("e-mail:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail address:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Alternative contact:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone No:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("Contact:", "Tel.:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("European contact:", "PII", 16, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end -rule "15: Redact contact information if Producer is found" - when - Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance")) - then - section.redactLineAfter("Contact:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); - section.redactLineAfter("Telephone:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Phone:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Contact:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax number:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone number:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel:", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("No:", "Fax", "PII", 15, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - end +rule "17: Redact contact information if Producer is found" + when + Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance")) + then + section.redactLineAfter("Contact:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 17, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end -rule "16: Redact AUTHOR(S)" +rule "18: Redact AUTHOR(S)" when Section(searchText.contains("AUTHOR(S):")) then - section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 16, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 18, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "17: Redact PERFORMING LABORATORY" +rule "19: Redact PERFORMING LABORATORY" when Section(searchText.contains("PERFORMING LABORATORY:")) then - section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 17, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 19, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "18: Redact On behalf of Sequani Ltd.:" +rule "20: Redact On behalf of Sequani Ltd.:" when Section(searchText.contains("On behalf of Sequani Ltd.: Name Title")) then - section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 18, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); + section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 20, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "19: Redact On behalf of Syngenta Ltd.:" +rule "21: Redact On behalf of Syngenta Ltd.:" when Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title")) then - section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 19, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 21, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end \ No newline at end of file