RED-740: Improved section recognition

This commit is contained in:
deiflaender 2020-11-27 15:39:31 +01:00
parent 0f0f6b3a2e
commit 4ef6e0e2ef
4 changed files with 80 additions and 42 deletions

View File

@ -72,7 +72,7 @@ public class ClassificationService {
document.setHeadlines(true);
}
}
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
document.setHeadlines(true);
}

View File

@ -333,7 +333,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Primicarb/74 Pirimicarb_RAR_01_Volume_1_2017-12-04.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
@ -431,7 +431,7 @@ public class RedactionIntegrationTest {
public void classificationTest() throws IOException {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))

View File

@ -124,76 +124,114 @@ rule "11: Redacted PII Personal Identification Information"
end
rule "12: Redact contact information if applicant is found"
// Rule 12: fires for any Section whose text contains at least one of the
// contact labels listed in the condition below (e.g. "Phone:", "Fax:",
// "E-mail:", "European contact:").
// On a match, every redactLineAfter / redactBetween call in the action part
// is issued with type "PII", rule number 12, the reason text
// "Contact information was found" and the same regulation reference string.
// NOTE(review): the meaning of the boolean argument (true) is defined by the
// Section helper methods, which are not visible here -- confirm before changing.
// NOTE(review): "Contact:" is acted on in the action part but is not one of the
// trigger labels in the condition, so those calls only run when another label
// matched the section -- confirm this is intended.
rule "12: Redact contact information"
when
Section(text.contains("Contact point:")
|| text.contains("Phone:")
|| text.contains("Fax:")
|| text.contains("Tel.:")
|| text.contains("Tel:")
|| text.contains("E-mail:")
|| text.contains("Email:")
|| text.contains("e-mail:")
|| text.contains("E-mail address:")
|| text.contains("Alternative contact:")
|| text.contains("Telephone number:")
|| text.contains("Telephone No:")
|| text.contains("Fax number:")
|| text.contains("Telephone:")
|| text.contains("European contact:"))
then
// One call per label; presumably each redacts the remainder of the line that follows the label.
section.redactLineAfter("Contact point:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Phone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Email:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("e-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail address:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Alternative contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone No:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
// Two range-style calls bounded by a start and an end marker instead of a single label.
section.redactBetween("No:", "Fax", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("European contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end
// Rule 13: fires for a Section when its headline contains the word "applicant",
// "Primary contact" or "Alternative contact", or when its text contains
// "Applicant", "Contact:" or "Telephone number:".
// Every action call uses type "PII" and the reason text
// "Applicant information was found".
// NOTE(review): as shown here the action part contains each call twice, first
// with rule number 12 and then with rule number 13; the "Company:" call exists
// only in the rule-number-12 group. This looks like the before/after lines of a
// renumbering change shown together -- confirm which set is the intended one.
rule "13: Redact contact information if applicant is found"
when
Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:"))
then
// Calls tagged with rule number 12.
section.redactLineAfter("Contact point:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Phone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Email:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("e-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail address:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Alternative contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone No:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Company:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("No:", "Fax", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("European contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
// Calls tagged with rule number 13 (same labels as above, without "Company:").
section.redactLineAfter("Contact point:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Phone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Email:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("e-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail address:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Alternative contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone No:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("No:", "Fax", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("Contact:", "Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("European contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end
rule "13: Redact contact information if Producer is found"
// Rule 14: fires for a Section whose lower-cased text contains
// "producer of the plant protection" or "producer of the active substance",
// or whose text contains "Manufacturer of the active substance",
// "Manufacturer:" or "Producer or producers of the active substance".
// Every action call uses type "PII" and the reason text "Producer was found".
// NOTE(review): as shown here the action part contains each call twice, first
// with rule number 13 and then with rule number 14 -- looks like the
// before/after lines of a renumbering change shown together; confirm.
// NOTE(review): within each group the "Contact:" call appears twice -- verify
// whether the repetition is needed.
rule "14: Redact contact information if Producer is found"
when
Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance"))
then
// Calls tagged with rule number 13.
section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Phone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("No:", "Fax", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
// Calls tagged with rule number 14 (same labels as above).
section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Phone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("E-mail:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Fax number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Telephone number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLineAfter("Tel:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("No:", "Fax", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end
rule "14: Redact AUTHOR(S)"
// Rule 15: fires for a Section whose searchText contains "AUTHOR(S):" and
// calls redactLinesBetween from "AUTHOR(S):" to "COMPLETION DATE:" with type "PII".
// NOTE(review): the call is present twice, once with rule number 14 and once
// with rule number 15 -- looks like before/after lines of a renumbering; confirm.
rule "15: Redact AUTHOR(S)"
when
Section(searchText.contains("AUTHOR(S):"))
then
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 14, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 15, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end
rule "15: Redact PERFORMING LABORATORY"
// Rule 16: fires for a Section whose searchText contains "PERFORMING LABORATORY:"
// and calls redactBetween from "PERFORMING LABORATORY:" to
// "LABORATORY PROJECT ID:" with type "PII".
// NOTE(review): the call is present twice, once with rule number 15 and once
// with rule number 16 -- looks like before/after lines of a renumbering; confirm.
rule "16: Redact PERFORMING LABORATORY"
when
Section(searchText.contains("PERFORMING LABORATORY:"))
then
section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 15, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 16, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end
rule "16: Redact On behalf of Sequani Ltd.:"
// Rule 17: fires for a Section whose searchText contains
// "On behalf of Sequani Ltd.: Name Title" and calls redactBetween from that
// marker up to the next "On behalf of" with type "PII".
// Unlike the contact rules, the boolean argument here is false; its meaning is
// defined by the Section helper, which is not visible here -- TODO confirm.
// NOTE(review): the call is present twice, once with rule number 16 and once
// with rule number 17 -- looks like before/after lines of a renumbering; confirm.
rule "17: Redact On behalf of Sequani Ltd.:"
when
Section(searchText.contains("On behalf of Sequani Ltd.: Name Title"))
then
section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 16, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end
rule "17: Redact On behalf of Syngenta Ltd.:"
// Rule 18: fires for a Section whose searchText contains
// "On behalf of Syngenta Ltd.: Name Title" and calls redactBetween from that
// marker up to "Study dates" with type "PII".
// The boolean argument here is false; its meaning is defined by the Section
// helper, which is not visible here -- TODO confirm.
// NOTE(review): the call is present twice, once with rule number 17 and once
// with rule number 18 -- looks like before/after lines of a renumbering; confirm.
rule "18: Redact On behalf of Syngenta Ltd.:"
when
Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title"))
then
section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 18, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
end