Pull request #35: Fix redaction in single cell tables

Merge in RED/redaction-service from RED-279 to master

* commit '3d5455d7297e5805ad30a578d00d6d1737820c7d':
  Fix redaction in single cell tables
This commit is contained in:
Thierry Goeckel 2020-09-02 14:39:08 +02:00
commit 34e058c4e4
4 changed files with 65 additions and 6 deletions

View File

@ -52,12 +52,13 @@ public class EntityRedactionService {
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
boolean singleCellTable = table.getRowCount() == 1 && table.getColCount() == 1;
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
for (Cell cell : row) {
if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
if (!singleCellTable && cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);

View File

@ -57,6 +57,7 @@ public class EntityRedactionServiceTest {
private static final String ADDRESS_CODE = "address";
private static final AtomicLong DICTIONARY_VERSION = new AtomicLong();
private static final AtomicLong RULES_VERSION = new AtomicLong();
@MockBean
private DictionaryClient dictionaryClient;
@ -69,6 +70,9 @@ public class EntityRedactionServiceTest {
@Autowired
private PdfSegmentationService pdfSegmentationService;
@Autowired
private DroolsExecutionService droolsExecutionService;
@TestConfiguration
public static class RedactionIntegrationTestConfiguration {
@ -185,7 +189,7 @@ public class EntityRedactionServiceTest {
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // two pages
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
.count()).isEqualTo(10);
@ -194,6 +198,60 @@ public class EntityRedactionServiceTest {
}
/**
 * Runs the "applicant" rule (rule number 6) against the
 * "Applicant Producer Table" sample PDF and checks how many entities it matched.
 *
 * <p>Steps: publish the rule text through the mocked rules client and refresh the
 * Drools session, stub the name and address dictionaries, parse and process the
 * PDF, then assert that the entity map has exactly one entry and that 18 entities
 * stored under key 1 carry matched rule 6.
 *
 * @throws IOException if the PDF resource cannot be read
 */
@Test
public void testApplicantInTableRedaction() throws IOException {
    // Rule text is assembled line by line; joining with "\n" yields no trailing newline after "end".
    String applicantRules = String.join("\n",
            "package drools",
            "",
            "import com.iqser.red.service.redaction.v1.server.redaction.model.Section",
            "",
            "global Section section",
            "rule \"6: Redact contact information if applicant is found\"",
            " when",
            " eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));",
            " then",
            " section.redactLineAfter(\"Name:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactBetween(\"Address:\", \"Contact\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Contact point:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Phone:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Fax:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Tel.:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Tel:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"E-mail:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Email:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Contact:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Telephone number:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Fax number:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactLineAfter(\"Telephone:\", \"address\", 6, \"Applicant information was found\");",
            " section.redactBetween(\"No:\", \"Fax\", \"address\", 6, \"Applicant information was found\");",
            " section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6, \"Applicant information was found\");",
            " end");

    // A fresh version number is handed out before the rules are refreshed.
    when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet());
    when(rulesClient.getRules()).thenReturn(new RulesResponse(applicantRules));
    droolsExecutionService.updateRules();

    ClassPathResource applicantPdf = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf");

    // Dictionary stubs: bump the version, then serve names and addresses from the test resources.
    when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet());
    DictionaryResponse nameDictionary = DictionaryResponse.builder()
            .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt")))
            .build();
    when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(nameDictionary);
    DictionaryResponse addressDictionary = DictionaryResponse.builder()
            .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt")))
            .build();
    when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressDictionary);

    try (PDDocument pdf = PDDocument.load(applicantPdf.getInputStream())) {
        Document parsed = pdfSegmentationService.parseDocument(pdf);
        entityRedactionService.processDocument(parsed, null);

        assertThat(parsed.getEntities()).hasSize(1); // one page

        // Count the entities under key 1 that were produced by rule 6.
        long ruleSixMatches = parsed.getEntities().get(1).stream()
                .filter(found -> found.getMatchedRule() == 6)
                .count();
        assertThat(ruleSixMatches).isEqualTo(18);
    }
}
@Test
public void headerPropagation() throws IOException {
@ -268,7 +326,7 @@ public class EntityRedactionServiceTest {
" section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" +
" section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" +
" end";
when(rulesClient.getVersion()).thenReturn(1L);
when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet());
when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules));
TypeResponse typeResponse = TypeResponse.builder()
.types(Arrays.asList(

View File

@ -49,7 +49,7 @@ rule "5: Do not redact in guideline sections"
section.redactNot("address", 5, "Section is a guideline section.");
end
rule "6: Redact contact information, if applicant is found"
rule "6: Redact contact information if applicant is found"
when
eval(section.headlineContainsWord("applicant") || section.getText().contains("Applicant"));
then
@ -70,7 +70,7 @@ rule "6: Redact contact information, if applicant is found"
section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found");
end
rule "7: Redact contact information, if Producer is found"
rule "7: Redact contact information if Producer is found"
when
eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance"));
then
@ -110,7 +110,7 @@ rule "9: Redact if must redact entry is found"
end
rule "10: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study"
rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate study"
when
Section(rowEquals("Vertebrate study Y/N", "Y"))
then