Remove entries from must_redact dict, add test and refactor rules

This commit is contained in:
Thierry Göckel 2020-09-30 16:08:56 +02:00
parent cec5fd3d5e
commit 6a8f366519
5 changed files with 37 additions and 18 deletions

View File

@ -44,7 +44,7 @@ public class Section {
}
/**
 * Reports whether at least one entity of this section has the given type.
 *
 * @param type entity type to look for; compared against each entity's type with {@code equals}
 * @return {@code true} if any entity's type equals {@code type}, {@code false} otherwise
 */
// NOTE(review): this hunk lists two signature lines for one body (contains, then matchesType);
// the rule file in the same commit calls matchesType.
public boolean contains(String type) {
public boolean matchesType(String type) {
return entities.stream().anyMatch(entity -> entity.getType().equals(type));
}
@ -83,12 +83,10 @@ public class Section {
public void redactIfPrecededBy(String prefix, String type, int ruleNumber, String reason) {
entities.forEach(entity -> {
if (entity.getType().equals(type)) {
if (searchText.indexOf(prefix + entity.getWord()) != 1) {
entity.setRedaction(true);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
}
if (entity.getType().equals(type) && searchText.indexOf(prefix + entity.getWord()) != 1) {
entity.setRedaction(true);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
}
});
}

View File

@ -445,13 +445,35 @@ public class RedactionIntegrationTest {
RedactionResult result = redactionController.redact(request);
result.getRedactionLog().getRedactionLogEntry().forEach(entry -> {
if(!entry.isHint()){
if (!entry.isHint()) {
assertThat(entry.getReason()).isEqualTo("Not redacted because row is not a vertebrate study");
}
});
}
/**
 * Runs the redaction controller on the sponsor-company sample PDF with flat redaction
 * disabled, checks that a document is returned, writes it to {@code /tmp/Redacted.pdf}
 * and prints the elapsed time and the page count.
 *
 * @throws IOException if the sample PDF cannot be read or the output file cannot be written
 */
@Test
public void sponsorCompanyTest() throws IOException {

    long start = System.currentTimeMillis();
    ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/sponsor_companies.pdf");

    // Close the classpath stream once its bytes are read; it was previously left open.
    byte[] pdfBytes;
    try (java.io.InputStream pdfStream = pdfFileResource.getInputStream()) {
        pdfBytes = IOUtils.toByteArray(pdfStream);
    }

    RedactionRequest request = RedactionRequest.builder()
            .document(pdfBytes)
            .build();
    request.setFlatRedaction(false);

    RedactionResult result = redactionController.redact(request);

    // Without any assertion the test could only fail on an exception.
    // NOTE(review): this only checks that output exists; assert the expected sponsor
    // redactions once the expected log entries for this sample are known.
    assertThat(result.getDocument()).isNotEmpty();

    try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Redacted.pdf")) {
        fileOutputStream.write(result.getDocument());
    }

    long end = System.currentTimeMillis();
    System.out.println("duration: " + (end - start));
    System.out.println("numberOfPages: " + result.getNumberOfPages());
}
private static String loadFromClassPath(String path) {

View File

@ -1,3 +1,2 @@
Batches Produced at
CTL
determination of residues

View File

@ -7,7 +7,7 @@ global Section section
rule "1: Redacted because Section contains Vertebrate"
when
eval(section.contains("vertebrate")==true);
Section(matchesType("vertebrate"))
then
section.redact("name", 1, "Redacted because Section contains Vertebrate");
section.redact("address", 1, "Redacted because Section contains Vertebrate");
@ -16,7 +16,7 @@ rule "1: Redacted because Section contains Vertebrate"
rule "2: Not Redacted because Section contains no Vertebrate"
when
eval(section.contains("vertebrate")==false);
Section(matchesType("vertebrate"))
then
section.redactNot("name", 2, "Not Redacted because Section contains no Vertebrate");
section.redactNot("address", 2, "Not Redacted because Section contains no Vertebrate");
@ -25,7 +25,7 @@ rule "2: Not Redacted because Section contains no Vertebrate"
rule "3: Do not redact Names and Addresses if no redaction Indicator is contained"
when
eval(section.contains("vertebrate")==true && section.contains("no_redaction_indicator")==true);
Section(matchesType("vertebrate"), matchesType("no_redaction_indicator"))
then
section.redactNot("name", 3, "Vertebrate was found, but also a no redaction indicator");
section.redactNot("address", 3, "Vertebrate was found, but also a no redaction indicator");
@ -34,7 +34,7 @@ rule "3: Do not redact Names and Addresses if no redaction Indicator is containe
rule "4: Redact Names and Addresses if no_redaction_indicator and redaction_indicator is contained"
when
eval(section.contains("vertebrate")==true && section.contains("no_redaction_indicator")==true && section.contains("redaction_indicator")==true);
Section(matchesType("vertebrate"), matchesType("no_redaction_indicator"), matchesType("redaction_indicator"))
then
section.redact("name", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator");
section.redact("address", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator");
@ -43,7 +43,7 @@ rule "4: Redact Names and Addresses if no_redaction_indicator and redaction_indi
rule "5: Do not redact in guideline sections"
when
eval(section.headlineContainsWord("guideline") || section.headlineContainsWord("Guidance"));
Section(headlineContainsWord("guideline") || headlineContainsWord("Guidance"))
then
section.redactNot("name", 5, "Section is a guideline section.");
section.redactNot("address", 5, "Section is a guideline section.");
@ -52,7 +52,7 @@ rule "5: Do not redact in guideline sections"
rule "6: Redact contact information if applicant is found"
when
eval(section.headlineContainsWord("applicant") || section.getText().contains("Applicant") || section.headlineContainsWord("Primary contact") || section.headlineContainsWord("Alternative contact"));
Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact"))
then
section.redactLineAfter("Name:", "address", 6, "Applicant information was found");
section.redactBetween("Address:", "Contact", "address", 6, "Applicant information was found");
@ -79,7 +79,7 @@ rule "6: Redact contact information if applicant is found"
rule "7: Redact contact information if Producer is found"
when
eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance"));
Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance"))
then
section.redactLineAfter("Name:", "address", 7, "Producer was found");
section.redactBetween("Address:", "Contact", "address", 7, "Producer was found");
@ -110,7 +110,7 @@ rule "8: Not redacted because Vertebrate Study = N"
rule "9: Redact if must redact entry is found"
when
eval(section.contains("must_redact")==true);
Section(matchesType("must_redact"))
then
section.redact("name", 9, "must_redact entry was found.");
section.redact("address", 9, "must_redact entry was found.");
@ -128,7 +128,7 @@ rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate
// Rule 11: matches a Section whose lower-cased text contains "batches produced at" and asks
// the section to redact "sponsor" entities that are preceded by that phrase.
// NOTE(review): the prefix passed to redactIfPrecededBy is lower-case while the condition
// lower-cases the text first; confirm the section's search text is compared case-insensitively.
rule "11: Redact sponsor company"
when
Section(section.getText().toLowerCase().contains("batches produced at"))
Section(text.toLowerCase().contains("batches produced at"))
then
section.redactIfPrecededBy("batches produced at", "sponsor", 11, "Redacted because it represents a sponsor company");
end