diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index a5e7e0f4..b41f1123 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -52,12 +52,13 @@ public class EntityRedactionService { List tables = paragraph.getTables(); for (Table table : tables) { + /* A table with exactly one row and one column still has its only cell processed, even when that cell is flagged as a header cell. */ boolean singleCellTable = table.getRowCount() == 1 && table.getColCount() == 1; for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); Map tabularData = new HashMap<>(); int start = 0; for (Cell cell : row) { - if (cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { + if ((!singleCellTable && cell.isHeaderCell()) || CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index f29e83d3..736c98bc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -57,6 +57,7 @@ public class EntityRedactionServiceTest { private static final String ADDRESS_CODE = "address"; private static final AtomicLong DICTIONARY_VERSION = new AtomicLong(); + private static final AtomicLong RULES_VERSION = new AtomicLong(); /* Bumped via incrementAndGet() wherever rulesClient.getVersion() is stubbed, so each such test reports a new rules version. */ @MockBean private DictionaryClient dictionaryClient; @@ -69,6 +70,9 @@ public class EntityRedactionServiceTest { @Autowired private PdfSegmentationService pdfSegmentationService; + @Autowired + private DroolsExecutionService droolsExecutionService; + @TestConfiguration public static class RedactionIntegrationTestConfiguration { @@ -185,7 +189,7 @@ public class EntityRedactionServiceTest { try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); entityRedactionService.processDocument(classifiedDoc, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // two pages + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page assertThat(classifiedDoc.getEntities().get(1).stream() .filter(entity -> entity.getMatchedRule() == 9) .count()).isEqualTo(10); @@ -194,6 +198,60 @@ public class EntityRedactionServiceTest { } + @Test + public void testApplicantInTableRedaction() throws IOException { /* Stubs rule 6 and the name/address dictionaries, processes "Applicant Producer Table.pdf", and expects entities on a single page with 18 of them matched by rule 6. */ + + String tableRules = "package drools\n" + + "\n" + + "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" + + "\n" + + "global Section section\n" + + "rule \"6: Redact contact information if applicant is found\"\n" + + " when\n" + + " eval(section.headlineContainsWord(\"applicant\") || section.getText().contains(\"Applicant\"));\n" + + " then\n" + + " section.redactLineAfter(\"Name:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactBetween(\"Address:\", \"Contact\", \"address\", 6, \"Applicant information was 
found\");\n" + + " section.redactLineAfter(\"Contact point:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Phone:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Fax:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Tel.:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Tel:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"E-mail:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Email:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Contact:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Telephone number:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Fax number:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactLineAfter(\"Telephone:\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactBetween(\"No:\", \"Fax\", \"address\", 6, \"Applicant information was found\");\n" + + " section.redactBetween(\"Contact:\", \"Tel.:\", \"address\", 6, \"Applicant information was found\");\n" + + " end"; + when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet()); + when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules)); + droolsExecutionService.updateRules(); + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf"); + when(dictionaryClient.getVersion()).thenReturn(DICTIONARY_VERSION.incrementAndGet()); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/names.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + 
DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(new ArrayList<>(ResourceLoader.load("dictionaries/addresses.txt"))) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 6) + .count()).isEqualTo(18); + } + + } + + @Test public void headerPropagation() throws IOException { @@ -268,7 +326,7 @@ public class EntityRedactionServiceTest { " section.redact(\"address\", 9, \"Redacted because row is a vertebrate study\");\n" + " section.highlightCell(\"Vertebrate study Y/N\", 9, \"must_redact\");\n" + " end"; - when(rulesClient.getVersion()).thenReturn(1L); + when(rulesClient.getVersion()).thenReturn(RULES_VERSION.incrementAndGet()); when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules)); TypeResponse typeResponse = TypeResponse.builder() .types(Arrays.asList( diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index e461ff8b..c991b0b5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -49,7 +49,7 @@ rule "5: Do not redact in guideline sections" section.redactNot("address", 5, "Section is a guideline section."); end -rule "6: Redact contact information, if applicant is found" +rule "6: Redact contact information if applicant is found" when eval(section.headlineContainsWord("applicant") || 
section.getText().contains("Applicant")); then @@ -70,7 +70,7 @@ rule "6: Redact contact information, if applicant is found" section.redactBetween("Contact:", "Tel.:", "address", 6, "Applicant information was found"); end -rule "7: Redact contact information, if Producer is found" +rule "7: Redact contact information if Producer is found" when eval(section.getText().toLowerCase().contains("producer of the plant protection") || section.getText().toLowerCase().contains("producer of the active substance") || section.getText().contains("Manufacturer of the active substance") || section.getText().contains("Manufacturer:") || section.getText().contains("Producer or producers of the active substance")); then @@ -110,7 +110,7 @@ rule "9: Redact if must redact entry is found" end -rule "10: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" +rule "10: Redact Authors and Addresses in Reference Table if it is a Vertebrate study" when Section(rowEquals("Vertebrate study Y/N", "Y")) then diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Applicant Producer Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Applicant Producer Table.pdf new file mode 100644 index 00000000..7a878561 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Applicant Producer Table.pdf differ