diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java index eb2fa386..8de040e6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java @@ -72,7 +72,7 @@ public class ClassificationService { document.setHeadlines(true); } } - }else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + }else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification("H " + (headlineFontSizes.size() + 1)); document.setHeadlines(true); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 02883207..5f51f6e5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -333,7 +333,7 @@ public class RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Primicarb/74 Pirimicarb_RAR_01_Volume_1_2017-12-04.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) @@ -431,7 +431,7 @@ public class RedactionIntegrationTest { public void classificationTest() throws IOException { System.out.println("classificationTest"); - ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 30380f9f..fa8dd95e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -124,76 +124,114 @@ rule "11: Redacted PII Personal Identification Information" end -rule "12: Redact contact information if applicant is found" +rule "12: Redact contact information" + when + Section(text.contains("Contact point:") + || text.contains("Phone:") + || text.contains("Fax:") + || text.contains("Tel.:") + || text.contains("Tel:") + || text.contains("E-mail:") + || text.contains("Email:") + || text.contains("e-mail:") + || text.contains("E-mail address:") + || text.contains("Alternative contact:") + || text.contains("Telephone number:") + || text.contains("Telephone No:") + || text.contains("Fax number:") + || text.contains("Telephone:") + || text.contains("European contact:")) + then + section.redactLineAfter("Contact point:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Email:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("e-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail address:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Alternative contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone No:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("European contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + end + + +rule "13: Redact contact information if applicant is found" when Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:")) then - section.redactLineAfter("Contact point:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Phone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Email:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("e-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail address:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Alternative contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone No:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Company:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("No:", "Fax", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("European contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact point:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Email:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("e-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail address:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Alternative contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone No:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("Contact:", "Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("European contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "13: Redact contact information if Producer is found" +rule "14: Redact contact information if Producer is found" when Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance")) then - section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Phone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("E-mail:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Fax number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Telephone number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactLineAfter("Tel:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); - section.redactBetween("No:", "Fax", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Phone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("E-mail:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Fax number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Telephone number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLineAfter("Tel:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("No:", "Fax", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "14: Redact AUTHOR(S)" +rule "15: Redact AUTHOR(S)" when Section(searchText.contains("AUTHOR(S):")) then - section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 14, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 15, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "15: Redact PERFORMING LABORATORY" +rule "16: Redact PERFORMING LABORATORY" when Section(searchText.contains("PERFORMING LABORATORY:")) then - section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 15, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 16, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "16: Redact On behalf of Sequani Ltd.:" +rule "17: Redact On behalf of Sequani Ltd.:" when Section(searchText.contains("On behalf of Sequani Ltd.: Name Title")) then - section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 16, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end -rule "17: Redact On behalf of Syngenta Ltd.:" +rule "18: Redact On behalf of Syngenta Ltd.:" when Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title")) then - section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); + section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 18, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)"); end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Fludioxonil/182 Fludioxonil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Fludioxonil/182 Fludioxonil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf new file mode 100644 index 00000000..017b442b Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Fludioxonil/182 Fludioxonil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf differ