Integrate legal basis into rules, redaction annotations, and tests

This commit is contained in:
Thierry Göckel 2020-10-26 10:07:41 +01:00
parent 04f0c29a49
commit f0f748db1b
6 changed files with 108 additions and 83 deletions

View File

@ -17,6 +17,7 @@ public class Entity {
private final String type;
private boolean redaction;
private String redactionReason;
private String legalBasis;
private List<EntityPositionSequence> positionSequences = new ArrayList<>();
private List<TextPositionSequence> targetSequences;
private Integer start;

View File

@ -56,13 +56,14 @@ public class Section {
}
public void redact(String type, int ruleNumber, String reason) {
/**
 * Marks every entity of the given type in this section as redacted.
 *
 * @param type       entity type to match, compared with {@code equals}
 * @param ruleNumber rule number recorded on each matching entity
 * @param reason     redaction reason recorded on each matching entity
 * @param legalBasis legal basis recorded on each matching entity
 */
public void redact(String type, int ruleNumber, String reason, String legalBasis) {
    for (Entity candidate : entities) {
        // Entities of any other type are left exactly as they are.
        if (!candidate.getType().equals(type)) {
            continue;
        }
        candidate.setRedaction(true);
        candidate.setMatchedRule(ruleNumber);
        candidate.setRedactionReason(reason);
        candidate.setLegalBasis(legalBasis);
    }
}
@ -80,19 +81,20 @@ public class Section {
}
public void redactIfPrecededBy(String prefix, String type, int ruleNumber, String reason) {
/**
 * Marks entities of the given type as redacted when the section's search text contains the
 * entity's word immediately preceded by {@code prefix}.
 * <p>
 * The check looks for {@code prefix + word} anywhere in the search text; it is not tied to the
 * position of the individual entity occurrence.
 *
 * @param prefix     text that must directly precede the entity's word
 * @param type       entity type to match, compared with {@code equals}
 * @param ruleNumber rule number recorded on each matching entity
 * @param reason     redaction reason recorded on each matching entity
 * @param legalBasis legal basis recorded on each matching entity
 */
public void redactIfPrecededBy(String prefix, String type, int ruleNumber, String reason, String legalBasis) {
    entities.forEach(entity -> {
        // indexOf returns -1 when the sequence is absent; comparing against any other value
        // would also accept entities whose prefixed word never occurs in the text.
        if (entity.getType().equals(type) && searchText.indexOf(prefix + entity.getWord()) != -1) {
            entity.setRedaction(true);
            entity.setMatchedRule(ruleNumber);
            entity.setRedactionReason(reason);
            entity.setLegalBasis(legalBasis);
        }
    });
}
public void redactLineAfter(String start, String asType, int ruleNumber, String reason) {
public void redactLineAfter(String start, String asType, int ruleNumber, String reason, String legalBasis) {
String[] values = StringUtils.substringsBetween(text, start, "\n");
@ -111,13 +113,14 @@ public class Section {
entity.setRedaction(true);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
entity.setLegalBasis(legalBasis);
}
});
}
public void redactBetween(String start, String stop, String asType, int ruleNumber, String reason) {
public void redactBetween(String start, String stop, String asType, int ruleNumber, String reason, String legalBasis) {
String[] values = StringUtils.substringsBetween(searchText, start, stop);
@ -136,6 +139,7 @@ public class Section {
entity.setRedaction(true);
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
entity.setLegalBasis(legalBasis);
}
});
}
@ -151,10 +155,15 @@ public class Section {
startIndex = searchText.indexOf(value, stopIndex);
stopIndex = startIndex + value.length();
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(searchText.charAt(startIndex - 1)) || isSeparator(searchText
.charAt(startIndex - 1))) && (stopIndex == searchText.length() || isSeparator(searchText.charAt(stopIndex)))) {
found.add(new Entity(searchText.substring(startIndex, stopIndex), asType, startIndex, stopIndex,
headline, sectionNumber));
if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(searchText.charAt(startIndex - 1)) || isSeparator(
searchText.charAt(startIndex - 1))) && (stopIndex == searchText.length() || isSeparator(searchText.charAt(
stopIndex)))) {
found.add(new Entity(searchText.substring(startIndex, stopIndex),
asType,
startIndex,
stopIndex,
headline,
sectionNumber));
}
} while (startIndex > -1);
@ -164,7 +173,8 @@ public class Section {
private boolean isSeparator(char c) {
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}", String.valueOf(c)) || c == '\"' || c == '' || c == '';
return Character.isWhitespace(c) || Pattern.matches("\\p{Punct}",
String.valueOf(c)) || c == '\"' || c == '' || c == '';
}
@ -186,23 +196,23 @@ public class Section {
/**
 * Annotates the cell under the given header with the redact flag set to {@code false}.
 * No reason and no legal basis are passed on to {@code annotateCell}.
 *
 * @param cellHeader header forwarded to {@code annotateCell}
 * @param ruleNumber rule number forwarded to {@code annotateCell}
 * @param type       entity type forwarded to {@code annotateCell}
 */
public void highlightCell(String cellHeader, int ruleNumber, String type) {
annotateCell(cellHeader, ruleNumber, type, false, null);
annotateCell(cellHeader, ruleNumber, type, false, null, null);
}
public void redactCell(String cellHeader, int ruleNumber, String type, String reason) {
/**
 * Annotates the cell under the given header with the redact flag set to {@code true},
 * forwarding the reason and the legal basis to {@code annotateCell}.
 *
 * @param cellHeader header forwarded to {@code annotateCell}
 * @param ruleNumber rule number forwarded to {@code annotateCell}
 * @param type       entity type forwarded to {@code annotateCell}
 * @param reason     redaction reason forwarded to {@code annotateCell}
 * @param legalBasis legal basis forwarded to {@code annotateCell}
 */
public void redactCell(String cellHeader, int ruleNumber, String type, String reason, String legalBasis) {
annotateCell(cellHeader, ruleNumber, type, true, reason);
annotateCell(cellHeader, ruleNumber, type, true, reason, legalBasis);
}
/**
 * Annotates the cell under the given header with the redact flag set to {@code false}
 * while still forwarding a reason; no legal basis is passed on to {@code annotateCell}.
 *
 * @param cellHeader header forwarded to {@code annotateCell}
 * @param ruleNumber rule number forwarded to {@code annotateCell}
 * @param type       entity type forwarded to {@code annotateCell}
 * @param reason     reason forwarded to {@code annotateCell}
 */
public void redactNotCell(String cellHeader, int ruleNumber, String type, String reason) {
annotateCell(cellHeader, ruleNumber, type, false, reason);
annotateCell(cellHeader, ruleNumber, type, false, reason, null);
}
private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason) {
private void annotateCell(String cellHeader, int ruleNumber, String type, boolean redact, String reason, String legalBasis) {
String cleanHeaderName = cellHeader.replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
@ -221,7 +231,8 @@ public class Section {
entity.setMatchedRule(ruleNumber);
entity.setRedactionReason(reason);
entity.setTargetSequences(value.getTextBlock()
.getSequences()); // Make sure no other cells with same content are highlighted
.getSequences()); // Make sure no other cells with same content are highlighted
entity.setLegalBasis(legalBasis);
// HashSet keeps the older value, but we want the new only.
entities.remove(entity);

View File

@ -22,9 +22,9 @@ import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@Slf4j
public class DictionaryService {
private final DictionaryClient dictionaryClient;

View File

@ -133,13 +133,13 @@ public class AnnotationHighlightService {
}
if (CollectionUtils.isNotEmpty(entityPositionSequence.getSequences())) {
List<Rectangle> rectanglesPerline = getRectanglesPerLine(entityPositionSequence.getSequences()
List<Rectangle> rectanglesPerLine = getRectanglesPerLine(entityPositionSequence.getSequences()
.stream()
.flatMap(seq -> seq.getTextPositions().stream())
.collect(Collectors.toList()), page);
redactionLogEntry.getPositions().addAll(rectanglesPerline);
annotations.addAll(createAnnotation(rectanglesPerline, prefixId(entity, entityPositionSequence.getId(), requestedToRemove, removeFromDictionary), createAnnotationContent(entity), getColor(entity, requestedToRemove), comments, !isHint(entity)));
redactionLogEntry.getPositions().addAll(rectanglesPerLine);
annotations.addAll(createAnnotation(rectanglesPerLine, prefixId(entity, entityPositionSequence.getId(), requestedToRemove, removeFromDictionary), createAnnotationContent(entity), getColor(entity, requestedToRemove), comments, !isHint(entity)));
}
redactionLogEntry.setId(entityPositionSequence.getId());
@ -238,7 +238,7 @@ public class AnnotationHighlightService {
if (manualRedactionEntry.isAddToDictionary()) {
return "request:add:" + manualRedactionEntry.getType() + ":" + id;
}
return "request:add:only_here" + ":" + id;
return "request:add:only_here:" + id;
}
return "ignore:" + manualRedactionEntry.getType() + ":" + id;
@ -316,14 +316,14 @@ public class AnnotationHighlightService {
/**
 * Builds the annotation text for a rule-based entity: the matched rule number, the redaction
 * reason, the legal basis (when one is set) and the headline of the containing section.
 *
 * @param entity entity whose rule, reason, legal basis and headline are rendered
 * @return the multi-line annotation text
 */
private String createAnnotationContent(Entity entity) {
    String legalBasis = entity.getLegalBasis();
    // String concatenation renders a null reference as the literal text "null", so the
    // legal-basis paragraph is omitted for entities that carry no legal basis.
    String legalBasisPart = legalBasis == null ? "" : "\n\nLegal basis: " + legalBasis;
    return "\nRule " + entity.getMatchedRule() + " matched\n\n" + entity.getRedactionReason() + legalBasisPart
            + "\n\nIn section: \"" + entity.getHeadline() + "\"";
}
/**
 * Builds the annotation text for a manual redaction: a fixed "Manual Redaction" title
 * followed by the section obtained from {@code entry.getSection()}.
 *
 * @param entry manual redaction entry whose section is rendered
 * @return the multi-line annotation text
 */
private String createAnnotationContent(ManualRedactionEntry entry) {
return "\nManual Redaction" + "\n\nIn Section : \"" + entry.getSection() + "\"";
return "\nManual Redaction\n\nIn Section : \"" + entry.getSection() + "\"";
}

View File

@ -238,21 +238,32 @@ public class EntityRedactionServiceTest {
" when\n" +
" eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));\n" +
" then\n" +
" section.redactLineAfter(\"Name:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactBetween(\"Address:\", \"Contact\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Contact point:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Phone:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Fax:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Tel.:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Tel:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"E-mail:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Email:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Contact:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Telephone number:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Fax number:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Telephone:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactBetween(\"No:\", \"Fax\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6, \"Applicant information was found\");\n" +
" section.redactLineAfter(\"Name:\", \"address\", 6, \"Applicant information was found\", \"Reg" +
" (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"Address:\", \"Contact\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Contact point:\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Phone:\", \"address\", 6, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Fax:\", \"address\", 6, \"Applicant information was found\", \"Reg " +
"(EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Tel.:\", \"address\", 6, \"Applicant information was found\", \"Reg" +
" (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Tel:\", \"address\", 6, \"Applicant information was found\", \"Reg " +
"(EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"E-mail:\", \"address\", 6, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Email:\", \"address\", 6, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Contact:\", \"address\", 6, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Telephone number:\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Fax number:\", \"address\", 6, \"Applicant information was found\"," +
" \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactLineAfter(\"Telephone:\", \"address\", 6, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"No:\", \"Fax\", \"address\", 6, \"Applicant information was found\", " +
"\"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6, \"Applicant information was found\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" end";
when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet());
when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules));
@ -372,8 +383,9 @@ public class EntityRedactionServiceTest {
" Section(rowEquals(\"Vertebrate study Y/N\", \"Y\") || rowEquals(\"Vertebrate study Y/N\", " +
"\"Yes\"))\n" +
" then\n" +
" section.redactCell(\"Author(s)\", 9, \"name\", \"Redacted because row is a vertebrate study\");\n" +
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" +
" section.redactCell(\"Author(s)\", 9, \"name\", \"Redacted because row is a vertebrate study\", \"Reg (EC) No 1107/2009 Art. 63 (2g)\");\n" +
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\", \"Reg (EC) No" +
" 1107/2009 Art. 63 (2g)\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
" end";
when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet());

View File

@ -9,8 +9,8 @@ rule "1: Redacted because Section contains Vertebrate"
when
Section(matchesType("vertebrate"))
then
section.redact("name", 1, "Redacted because Section contains Vertebrate");
section.redact("address", 1, "Redacted because Section contains Vertebrate");
section.redact("name", 1, "Redacted because Section contains Vertebrate", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redact("address", 1, "Redacted because Section contains Vertebrate", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end
@ -36,8 +36,8 @@ rule "4: Redact Names and Addresses if no_redaction_indicator and redaction_indi
when
Section(matchesType("vertebrate"), matchesType("no_redaction_indicator"), matchesType("redaction_indicator"))
then
section.redact("name", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator");
section.redact("address", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator");
section.redact("name", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redact("address", 4, "Vertebrate was found and no_redaction_indicator and redaction_indicator", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end
@ -54,26 +54,26 @@ rule "6: Redact contact information if applicant is found"
when
Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact"))
then
section.redactLineAfter("Name:", "address", 6, "Applicant information was found");
section.redactBetween("Address:", "Contact", "address", 6, "Applicant information was found");
section.redactLineAfter("Contact point:", "address", 6, "Applicant information was found");
section.redactLineAfter("Phone:", "address", 6, "Applicant information was found");
section.redactLineAfter("Fax:", "address", 6, "Applicant information was found");
section.redactLineAfter("Tel.:", "address", 6, "Applicant information was found");
section.redactLineAfter("Tel:", "address", 6, "Applicant information was found");
section.redactLineAfter("E-mail:", "address", 6, "Applicant information was found");
section.redactLineAfter("Email:", "address", 6, "Applicant information was found");
section.redactLineAfter("e-mail:", "address", 6, "Applicant information was found");
section.redactLineAfter("E-mail address:", "address", 6, "Applicant information was found");
section.redactLineAfter("Contact:", "address", 6, "Applicant information was found");
section.redactLineAfter("Alternative contact:", "address", 6, "Applicant information was found");
section.redactLineAfter("Telephone number:", "address", 6, "Applicant information was found");
section.redactLineAfter("Telephone No:", "address", 6, "Applicant information was found");
section.redactLineAfter("Fax number:", "address", 6, "Applicant information was found");
section.redactLineAfter("Telephone:", "address", 6, "Applicant information was found");
section.redactLineAfter("Company:", "address", 6, "Applicant information was found");
section.redactBetween("No:", "Fax", "address", 6, "Applicant information was found");
section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found");
section.redactLineAfter("Name:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("Address:", "Contact", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Contact point:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Phone:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Fax:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Tel.:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Tel:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("E-mail:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Email:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("e-mail:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("E-mail address:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Contact:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Alternative contact:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Telephone number:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Telephone No:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Fax number:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Telephone:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Company:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("No:", "Fax", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end
@ -81,20 +81,20 @@ rule "7: Redact contact information if Producer is found"
when
Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance"))
then
section.redactLineAfter("Name:", "address", 7, "Producer was found");
section.redactBetween("Address:", "Contact", "address", 7, "Producer was found");
section.redactBetween("Contact:", "Phone", "address", 7, "Producer was found");
section.redactBetween("Contact:", "Telephone number:", "address", 7, "Producer was found");
section.redactBetween("Address:", "Manufacturing", "address", 7, "Producer was found");
section.redactLineAfter("Telephone:", "address", 7, "Producer was found");
section.redactLineAfter("Phone:", "address", 7, "Producer was found");
section.redactLineAfter("Fax:", "address", 7, "Producer was found");
section.redactLineAfter("E-mail:", "address", 7, "Producer was found");
section.redactLineAfter("Contact:", "address", 7, "Producer was found");
section.redactLineAfter("Fax number:", "address", 7, "Producer was found");
section.redactLineAfter("Telephone number:", "address", 7, "Producer was found");
section.redactLineAfter("Tel:", "address", 7, "Producer was found");
section.redactBetween("No:", "Fax", "address", 7, "Producer was found");
section.redactLineAfter("Name:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("Address:", "Contact", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("Contact:", "Phone", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("Contact:", "Telephone number:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("Address:", "Manufacturing", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Telephone:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Phone:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Fax:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("E-mail:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Contact:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Fax number:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Telephone number:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactLineAfter("Tel:", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redactBetween("No:", "Fax", "address", 7, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end
@ -112,8 +112,8 @@ rule "9: Redact if must redact entry is found"
when
Section(matchesType("must_redact"))
then
section.redact("name", 9, "must_redact entry was found.");
section.redact("address", 9, "must_redact entry was found.");
section.redact("name", 9, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redact("address", 9, "must_redact entry was found.", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end
@ -121,14 +121,15 @@ rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate
when
Section(rowEquals("Vertebrate study Y/N", "Y") || rowEquals("Vertebrate study Y/N", "Yes"))
then
section.redactCell("Author(s)", 10, "name", "Redacted because row is a vertebrate study");
section.redact("address", 10, "Redacted because row is a vertebrate study");
section.redactCell("Author(s)", 10, "name", "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.redact("address", 10, "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)");
section.highlightCell("Vertebrate study Y/N", 10, "must_redact");
end
rule "11: Redact sponsor company"
when
Section(text.toLowerCase().contains("batches produced at"))
then
section.redactIfPrecededBy("batches produced at", "sponsor", 11, "Redacted because it represents a sponsor company");
section.redactIfPrecededBy("batches produced at", "sponsor", 11, "Redacted because it represents a sponsor company", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end