diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index d519bf06..640a4f4b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -9,6 +9,8 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; +import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; + import lombok.Builder; import lombok.Data; import lombok.extern.slf4j.Slf4j; @@ -18,6 +20,8 @@ import lombok.extern.slf4j.Slf4j; @Builder public class Section { + private DictionaryService dictionaryService; + private Set entities; // This still contains linebreaks etc. @@ -94,13 +98,15 @@ public class Section { } - public void addHintAnnotation(String value, String asType){ + public void addHintAnnotation(String value, String asType) { + Set found = findEntities(value.trim(), asType, true); entities.addAll(found); } - public void redactLineAfter(String start, String asType, int ruleNumber, String reason, String legalBasis) { + public void redactLineAfter(String start, String asType, int ruleNumber, boolean redactEverywhere, String reason, + String legalBasis) { String[] values = StringUtils.substringsBetween(text, start, "\n"); @@ -108,6 +114,8 @@ public class Section { for (String value : values) { if (StringUtils.isNotBlank(value)) { Set found = findEntities(value.trim(), asType, false); + // HashSet keeps the older value, but we want the new only. 
+ entities.removeAll(found); entities.addAll(found); } } @@ -126,7 +134,8 @@ public class Section { } - public void redactBetween(String start, String stop, String asType, int ruleNumber, String reason, String legalBasis) { + public void redactBetween(String start, String stop, String asType, int ruleNumber, boolean redactEverywhere, + String reason, String legalBasis) { String[] values = StringUtils.substringsBetween(searchText, start, stop); @@ -134,7 +143,47 @@ public class Section { for (String value : values) { if (StringUtils.isNotBlank(value)) { Set found = findEntities(value.trim(), asType, false); + // HashSet keeps the older value, but we want the new only. + entities.removeAll(found); entities.addAll(found); + if (redactEverywhere) { + dictionaryService.addToLocalDictionary(asType, value.trim()); + } + } + } + } + + // TODO No need to iterate + entities.forEach(entity -> { + if (entity.getType().equals(asType)) { + entity.setRedaction(true); + entity.setMatchedRule(ruleNumber); + entity.setRedactionReason(reason); + entity.setLegalBasis(legalBasis); + } + }); + } + + + public void redactLinesBetween(String start, String stop, String asType, int ruleNumber, boolean redactEverywhere, + String reason, String legalBasis) { + + String[] values = StringUtils.substringsBetween(text, start, stop); + + if (values != null) { + for (String value : values) { + if (StringUtils.isNotBlank(value)) { + String[] lines = value.split("\n"); + for (String line : lines) { + Set found = findEntities(line.trim(), asType, false); + + // HashSet keeps the older value, but we want the new only. + entities.removeAll(found); + entities.addAll(found); + if (redactEverywhere) { + dictionaryService.addToLocalDictionary(asType, line.trim()); + } + } } } } @@ -158,22 +207,15 @@ public class Section { String text = caseinsensitive ? searchText.toLowerCase() : searchText; String searchValue = caseinsensitive ? 
value.toLowerCase() : value; - int startIndex; int stopIndex = 0; do { startIndex = text.indexOf(searchValue, stopIndex); stopIndex = startIndex + searchValue.length(); - if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(text.charAt(startIndex - 1)) || isSeparator( - text.charAt(startIndex - 1))) && (stopIndex == text.length() || isSeparator(text.charAt( - stopIndex)))) { - found.add(new Entity(searchText.substring(startIndex, stopIndex), - asType, - startIndex, - stopIndex, - headline, - sectionNumber)); + if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(text.charAt(startIndex - 1)) || isSeparator(text + .charAt(startIndex - 1))) && (stopIndex == text.length() || isSeparator(text.charAt(stopIndex)))) { + found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex, headline, sectionNumber)); } } while (startIndex > -1); @@ -183,8 +225,7 @@ public class Section { private boolean isSeparator(char c) { - return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", - String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; + return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '‘' || c == '’'; } @@ -222,7 +263,8 @@ public class Section { } - private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason, String legalBasis) { + private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason, + String legalBasis) { String cleanHeaderName = cellHeader.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", ""); @@ -231,17 +273,12 @@ public class Section { log.warn("Could not find any data for {}.", cellHeader); } else { String word = value.toString(); - Entity entity = new Entity(word, - type, - value.getRowSpanStart(), - value.getRowSpanStart() + word.length(), - headline, - sectionNumber); + Entity entity = new Entity(word, type, value.getRowSpanStart(), 
value.getRowSpanStart() + word.length(), headline, sectionNumber); entity.setRedaction(redact); entity.setMatchedRule(ruleNumber); entity.setRedactionReason(reason); entity.setTargetSequences(value.getTextBlock() - .getSequences()); // Make sure no other cells with same content are highlighted + .getSequences()); // Make sure no other cells with same content are highlighted entity.setLegalBasis(legalBasis); // HashSet keeps the older value, but we want the new only. diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java index 4191ca55..62c539a0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java @@ -2,11 +2,13 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; import java.awt.Color; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.stream.Collectors; import org.apache.commons.collections4.CollectionUtils; @@ -33,7 +35,10 @@ public class DictionaryService { private long dictionaryVersion = -1; @Getter - private Map> dictionary = new HashMap<>(); + private Map> dictionary = new TreeMap<>(Comparator.reverseOrder()); // Using TreeMap, because order of keys is important. + + @Getter + private Map> localDictionary = new TreeMap<>(Comparator.reverseOrder()); // Using TreeMap, because order of keys is important. 
@Getter private Map entryColors = new HashMap<>(); @@ -57,6 +62,18 @@ public class DictionaryService { private float[] notRedactedColor; + public void addToLocalDictionary(String type, String value) { + + localDictionary.computeIfAbsent(type, (x) -> new HashSet<>()).add(value); + } + + + public void clearLocalDictionary() { + + localDictionary = new TreeMap<>(Comparator.reverseOrder()); + } + + public void updateDictionary() { long version = dictionaryClient.getVersion(); @@ -85,10 +102,11 @@ public class DictionaryService { .filter(TypeResult::isCaseInsensitive) .map(TypeResult::getType) .collect(Collectors.toList()); - dictionary = entryColors.keySet() - .stream() - .collect(Collectors.toMap(type -> type, this::convertEntries)); + dictionary = new TreeMap<>(Comparator.reverseOrder()); + entryColors.keySet().forEach(type -> { + dictionary.put(type, convertEntries(type)); + }); Colors colors = dictionaryClient.getColors(); defaultColor = convertColor(colors.getDefaultColor()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index e16a7bd6..6fca0281 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -40,7 +40,37 @@ public class EntityRedactionService { dictionaryService.updateDictionary(); droolsExecutionService.updateRules(); + dictionaryService.clearLocalDictionary(); + Set documentEntities = new HashSet<>(); + documentEntities.addAll(findEntities(classifiedDoc, manualRedactions, dictionaryService.getDictionary())); + + 
if(!dictionaryService.getLocalDictionary().isEmpty()){ + Set foundByLocal = findEntities(classifiedDoc, manualRedactions, dictionaryService.getLocalDictionary()); + // HashSet keeps the older value, but we want the new only. + documentEntities.removeAll(foundByLocal); + documentEntities.addAll(foundByLocal); + } + + for (Entity entity : documentEntities) { + Map> sequenceOnPage = new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); + } + + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + classifiedDoc.getEntities() + .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity.getLegalBasis())); + } + } + + } + + + private Set findEntities(Document classifiedDoc, ManualRedactions manualRedactions, Map> dictionary){ Set documentEntities = new HashSet<>(); int sectionNumber = 1; for (Paragraph paragraph : classifiedDoc.getParagraphs()) { @@ -75,9 +105,10 @@ public class EntityRedactionService { searchableRow.addAll(textBlock.getSequences()); } } - Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber); + Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary); Section analysedRowSection = droolsExecutionService.executeRules(Section.builder() + .dictionaryService(dictionaryService) .entities(rowEntities) .text(searchableRow.getAsStringWithLinebreaks()) .searchText(searchableRow.toString()) @@ -93,8 +124,9 @@ public class EntityRedactionService { } addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber); - Set entities = findEntities(searchableText, 
paragraph.getHeadline(), sectionNumber); + Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary); Section analysedSection = droolsExecutionService.executeRules(Section.builder() + .dictionaryService(dictionaryService) .entities(entities) .text(searchableText.getAsStringWithLinebreaks()) .searchText(searchableText.toString()) @@ -105,22 +137,7 @@ public class EntityRedactionService { documentEntities.addAll(clearAndFindPositions(analysedSection.getEntities(), searchableText)); sectionNumber++; } - - for (Entity entity : documentEntities) { - Map> sequenceOnPage = new HashMap<>(); - for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { - sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) - .add(entityPositionSequence); - } - - for (Map.Entry> entry : sequenceOnPage.entrySet()) { - classifiedDoc.getEntities() - .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry - .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity.getLegalBasis())); - } - } - + return documentEntities; } @@ -140,7 +157,7 @@ public class EntityRedactionService { } - private Set findEntities(SearchableText searchableText, String headline, int sectionNumber) { + private Set findEntities(SearchableText searchableText, String headline, int sectionNumber, Map> dictionary) { Set found = new HashSet<>(); String searchableString = searchableText.toString(); @@ -149,7 +166,7 @@ public class EntityRedactionService { } String lowercaseInputString = searchableString.toLowerCase(); - for (Map.Entry> entry : dictionaryService.getDictionary().entrySet()) { + for (Map.Entry> entry : dictionary.entrySet()) { if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) { found.addAll(find(lowercaseInputString, entry.getValue(), 
entry.getKey(), headline, sectionNumber)); } else { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 4d48e641..69ddf7fb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -67,8 +67,8 @@ public class RedactionIntegrationTest { private static final String RULES = loadFromClassPath("drools/rules.drl"); private static final String VERTEBRATE = "vertebrate"; - private static final String ADDRESS = "address"; - private static final String AUTHOR = "author"; + private static final String ADDRESS = "CBI_address"; + private static final String AUTHOR = "CBI_author"; private static final String SPONSOR = "sponsor"; private static final String NO_REDACTION_INDICATOR = "no_redaction_indicator"; private static final String REDACTION_INDICATOR = "redaction_indicator"; @@ -77,6 +77,8 @@ public class RedactionIntegrationTest { private static final String PUBLISHED_INFORMATION = "published_information"; private static final String TEST_METHOD = "test_method"; + private static final String PII = "PII"; + @Autowired private RedactionController redactionController; @@ -134,6 +136,7 @@ public class RedactionIntegrationTest { when(dictionaryClient.getDictionaryForType(MUST_REDACT)).thenReturn(getDictionaryResponse(MUST_REDACT)); when(dictionaryClient.getDictionaryForType(PUBLISHED_INFORMATION)).thenReturn(getDictionaryResponse(PUBLISHED_INFORMATION)); when(dictionaryClient.getDictionaryForType(TEST_METHOD)).thenReturn(getDictionaryResponse(TEST_METHOD)); + 
when(dictionaryClient.getDictionaryForType(PII)).thenReturn(getDictionaryResponse(PII)); when(dictionaryClient.getColors()).thenReturn(colors); } @@ -141,7 +144,7 @@ public class RedactionIntegrationTest { private void loadDictionaryForTest() { dictionary.computeIfAbsent(AUTHOR, v -> new ArrayList<>()) - .addAll(ResourceLoader.load("dictionaries/author.txt") + .addAll(ResourceLoader.load("dictionaries/CBI_author.txt") .stream() .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); @@ -156,7 +159,7 @@ public class RedactionIntegrationTest { .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); dictionary.computeIfAbsent(ADDRESS, v -> new ArrayList<>()) - .addAll(ResourceLoader.load("dictionaries/address.txt") + .addAll(ResourceLoader.load("dictionaries/CBI_address.txt") .stream() .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); @@ -190,6 +193,11 @@ public class RedactionIntegrationTest { .stream() .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); + dictionary.computeIfAbsent(PII, v -> new ArrayList<>()) + .addAll(ResourceLoader.load("dictionaries/PII.txt") + .stream() + .map(this::cleanDictionaryEntry) + .collect(Collectors.toSet())); } @@ -211,6 +219,7 @@ public class RedactionIntegrationTest { typeColorMap.put(MUST_REDACT, "#fab4c0"); typeColorMap.put(PUBLISHED_INFORMATION, "#85ebff"); typeColorMap.put(TEST_METHOD, "#91fae8"); + typeColorMap.put(PII, "#66ccff"); hintTypeMap.put(VERTEBRATE, true); @@ -223,6 +232,7 @@ public class RedactionIntegrationTest { hintTypeMap.put(MUST_REDACT, true); hintTypeMap.put(PUBLISHED_INFORMATION, true); hintTypeMap.put(TEST_METHOD, true); + hintTypeMap.put(PII, false); caseInSensitiveMap.put(VERTEBRATE, true); caseInSensitiveMap.put(ADDRESS, false); @@ -234,6 +244,7 @@ public class RedactionIntegrationTest { caseInSensitiveMap.put(MUST_REDACT, true); caseInSensitiveMap.put(PUBLISHED_INFORMATION, true); caseInSensitiveMap.put(TEST_METHOD, false); + caseInSensitiveMap.put(PII, false); 
colors.setDefaultColor("#acfc00"); colors.setNotRedacted("#cccccc"); @@ -322,7 +333,7 @@ public class RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index d8c19c38..03fd0be9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -185,11 +185,11 @@ public class EntityRedactionServiceTest { " Supplement - Identity of the active substance - Reference list.pdf"); when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/author.txt"))) + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))) .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/address.txt"))) + .entries(new 
ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() @@ -222,11 +222,11 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf"); when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/author.txt"))) + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))) .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/address.txt"))) + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() @@ -257,32 +257,32 @@ public class EntityRedactionServiceTest { " when\n" + " eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));\n" + " then\n" + - " section.redactLineAfter(\"Name:\", \"address\", 6, \"Applicant information was found\", \"Reg" + + " section.redactLineAfter(\"Name:\", \"address\", 6,true, \"Applicant information was found\", \"Reg" + " (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactBetween(\"Address:\", \"Contact\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Contact point:\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 
63 (2g)\");\n" + - " section.redactLineAfter(\"Phone:\", \"address\", 6, \"Applicant information was found\", " + + " section.redactBetween(\"Address:\", \"Contact\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + + " section.redactLineAfter(\"Contact point:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + + " section.redactLineAfter(\"Phone:\", \"address\", 6,true, \"Applicant information was found\", " + "\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Fax:\", \"address\", 6, \"Applicant information was found\", \"Reg " + + " section.redactLineAfter(\"Fax:\", \"address\", 6,true, \"Applicant information was found\", \"Reg " + "(EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Tel.:\", \"address\", 6, \"Applicant information was found\", \"Reg" + + " section.redactLineAfter(\"Tel.:\", \"address\", 6,true, \"Applicant information was found\", \"Reg" + " (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Tel:\", \"address\", 6, \"Applicant information was found\", \"Reg " + + " section.redactLineAfter(\"Tel:\", \"address\", 6,true, \"Applicant information was found\", \"Reg " + "(EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"E-mail:\", \"address\", 6, \"Applicant information was found\", " + + " section.redactLineAfter(\"E-mail:\", \"address\", 6,true, \"Applicant information was found\", " + "\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Email:\", \"address\", 6, \"Applicant information was found\", " + + " section.redactLineAfter(\"Email:\", \"address\", 6,true, \"Applicant information was found\", " + "\"Reg (EC) No 1107/2009 Art. 
63 (2g)\");\n" + - " section.redactLineAfter(\"Contact:\", \"address\", 6, \"Applicant information was found\", " + + " section.redactLineAfter(\"Contact:\", \"address\", 6,true, \"Applicant information was found\", " + "\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Telephone number:\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Fax number:\", \"address\", 6, \"Applicant information was found\"," + + " section.redactLineAfter(\"Telephone number:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + + " section.redactLineAfter(\"Fax number:\", \"address\", 6,true, \"Applicant information was found\"," + " \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactLineAfter(\"Telephone:\", \"address\", 6, \"Applicant information was found\", " + + " section.redactLineAfter(\"Telephone:\", \"address\", 6,true, \"Applicant information was found\", " + "\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactBetween(\"No:\", \"Fax\", \"address\", 6, \"Applicant information was found\", " + + " section.redactBetween(\"No:\", \"Fax\", \"address\", 6,true, \"Applicant information was found\", " + "\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + - " section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" + + " section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6,true, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 
63 (2g)\");\n" + " end"; when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet()); when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules)); @@ -291,11 +291,11 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf"); when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/author.txt"))) + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))) .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/address.txt"))) + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/address.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt similarity index 100% rename from redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/address.txt rename to redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt similarity index 100% rename from redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/author.txt rename to 
redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/PII.txt new file mode 100644 index 00000000..e69de29b diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 6220d7d9..c8908bd7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -5,12 +5,14 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Section global Section section +// --------------------------------------- CBI rules ------------------------------------------------------------------- + rule "1: Redacted because Section contains Vertebrate" when Section(matchesType("vertebrate")) then - section.redact("author", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("address", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_author", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 1, "Vertebrate found", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); end @@ -18,8 +20,8 @@ rule "2: Not Redacted because Section contains no Vertebrate" when Section(!matchesType("vertebrate")) then - section.redactNot("author", 2, "No Vertebrate found"); - section.redactNot("address", 2, "No Vertebrate found"); + section.redactNot("CBI_author", 2, "No Vertebrate found"); + section.redactNot("CBI_address", 2, "No Vertebrate found"); end @@ -27,8 +29,8 @@ rule "3: Do not redact Names and Addresses if no redaction Indicator is containe when Section(matchesType("vertebrate"), matchesType("no_redaction_indicator")) then - section.redactNot("author", 3, "Vertebrate and No Redaction Indicator found"); - section.redactNot("address", 3, "Vertebrate and No Redaction Indicator found"); + section.redactNot("CBI_author", 3, "Vertebrate and No Redaction Indicator found"); + section.redactNot("CBI_address", 3, "Vertebrate and No Redaction Indicator found"); end @@ -36,8 +38,8 @@ rule "4: Do not redact Names and Addresses if no redaction Indicator is containe when Section(matchesType("vertebrate"), matchesType("published_information")) then - section.redactNot("author", 4, "Vertebrate and Published Information found"); - section.redactNot("address", 4, "Vertebrate and Published Information found"); + section.redactNot("CBI_author", 4, "Vertebrate and Published Information found"); + section.redactNot("CBI_address", 4, "Vertebrate and Published Information found"); end @@ -45,87 +47,153 @@ rule "5: Redact Names and Addresses if no_redaction_indicator and redaction_indi when Section(matchesType("vertebrate"), matchesType("no_redaction_indicator"), matchesType("redaction_indicator")) then - section.redact("author", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("address", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_author", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); + section.redact("CBI_address", 5, "Vertebrate and Redaction Indicator found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); end -rule "6: Redact contact information if applicant is found" - when - Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:")) - then - section.redactLineAfter("Contact point:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Phone:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Fax:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Tel.:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Tel:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("E-mail:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Email:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("e-mail:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("E-mail address:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Contact:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Alternative contact:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Telephone number:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); - section.redactLineAfter("Telephone No:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Fax number:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Telephone:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Company:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactBetween("No:", "Fax", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("European contact:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - end - - -rule "7: Redact contact information if Producer is found" - when - Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance")) - then - section.redactLineAfter("Contact:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Telephone:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Phone:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Fax:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("E-mail:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Contact:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); - section.redactLineAfter("Fax number:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Telephone number:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactLineAfter("Tel:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redactBetween("No:", "Fax", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - end - - -rule "8: Not redacted because Vertebrate Study = N" +rule "6: Not redacted because Vertebrate Study = N" when Section(rowEquals("Vertebrate study Y/N", "N") || rowEquals("Vertebrate study Y/N", "No")) then - section.redactNotCell("Author(s)", 8, "author", "Not redacted because row is not a vertebrate study"); - section.redactNot("address", 8, "Not redacted because row is not a vertebrate study"); - section.highlightCell("Vertebrate study Y/N", 8, "hint_only"); + section.redactNotCell("Author(s)", 6, "CBI_author", "Not redacted because row is not a vertebrate study"); + section.redactNot("CBI_address", 6, "Not redacted because row is not a vertebrate study"); + section.highlightCell("Vertebrate study Y/N", 6, "hint_only"); end -rule "9: Redact if must redact entry is found" +rule "7: Redact if must redact entry is found" when Section(matchesType("must_redact")) then - section.redact("author", 9, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("address", 9, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_author", 7, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 7, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 
63 (2g)"); end -rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate study" +rule "8: Redact Authors and Addresses in Reference Table if it is a Vertebrate study" when Section(rowEquals("Vertebrate study Y/N", "Y") || rowEquals("Vertebrate study Y/N", "Yes")) then - section.redactCell("Author(s)", 10, "author", "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.redact("address", 10, "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)"); - section.highlightCell("Vertebrate study Y/N", 10, "must_redact"); + section.redactCell("Author(s)", 8, "CBI_author", "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redact("CBI_address", 8, "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.highlightCell("Vertebrate study Y/N", 8, "must_redact"); end -rule "11: Redact sponsor company" +rule "9: Redact sponsor company" when Section(searchText.toLowerCase().contains("batches produced at")) then - section.redactIfPrecededBy("batches produced at", "sponsor", 11, "Redacted because it represents a sponsor company", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.redactIfPrecededBy("batches produced at", "sponsor", 9, "Redacted because it represents a sponsor company", "Reg (EC) No 1107/2009 Art. 63 (2g)"); section.addHintAnnotation("batches produced at", "must_redact"); + end + + +rule "10: Redact determination of residues" + when + Section(searchText.toLowerCase().contains("determination of residues") && ( + searchText.toLowerCase().contains("livestock") || + searchText.toLowerCase().contains("live stock") || + searchText.toLowerCase().contains("egg") || + searchText.toLowerCase().contains("milk") || + searchText.toLowerCase().contains("bovine") || + searchText.toLowerCase().contains("ruminant") + )) + then + section.redact("CBI_author", 10, "Determination of residues was found.", "Reg (EC) No 1107/2009 Art.
63 (2g)"); + section.redact("CBI_address", 10, "Determination of residues was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + section.addHintAnnotation("determination of residues", "must_redact"); + section.addHintAnnotation("livestock", "must_redact"); + section.addHintAnnotation("live stock", "must_redact"); + section.addHintAnnotation("egg", "must_redact"); + section.addHintAnnotation("milk", "must_redact"); + section.addHintAnnotation("bovine", "must_redact"); + section.addHintAnnotation("ruminant", "must_redact"); + end + + +// --------------------------------------- PII rules ------------------------------------------------------------------- + + +rule "11: Redacted PII Personal Identification Information" + when + Section(matchesType("PII")) + then + section.redact("PII", 11, "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end + + +rule "12: Redact contact information if applicant is found" + when + Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:")) + then + section.redactLineAfter("Contact point:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art.
63 (2e)"); + section.redactLineAfter("Email:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("e-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail address:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Alternative contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone No:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Company:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("European contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); + end + + +rule "13: Redact contact information if Producer is found" + when + Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance")) + then + section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end + + +rule "14: Redact AUTHOR(S)" + when + Section(searchText.contains("AUTHOR(S):")) + then + section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 14, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 
63 (2e)"); + end + + +rule "15: Redact PERFORMING LABORATORY" + when + Section(searchText.contains("PERFORMING LABORATORY:")) + then + section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 15, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end + + +rule "16: Redact On behalf of Sequani Ltd.:" + when + Section(searchText.contains("On behalf of Sequani Ltd.: Name Title")) + then + section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 16, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end + + +rule "17: Redact On behalf of Syngenta Ltd.:" + when + Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title")) + then + section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/Single Study - Oral (Gavage) Mouse.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/Single Study - Oral (Gavage) Mouse.pdf new file mode 100644 index 00000000..be0666aa Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/Single Study - Oral (Gavage) Mouse.pdf differ