RED-740: Improved section recognition
This commit is contained in:
parent
0f0f6b3a2e
commit
4ef6e0e2ef
@ -72,7 +72,7 @@ public class ClassificationService {
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
|
||||
@ -333,7 +333,7 @@ public class RedactionIntegrationTest {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Primicarb/74 Pirimicarb_RAR_01_Volume_1_2017-12-04.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
@ -431,7 +431,7 @@ public class RedactionIntegrationTest {
|
||||
public void classificationTest() throws IOException {
|
||||
|
||||
System.out.println("classificationTest");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
|
||||
|
||||
RedactionRequest request = RedactionRequest.builder()
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
|
||||
@ -124,76 +124,114 @@ rule "11: Redacted PII Personal Identification Information"
|
||||
end
|
||||
|
||||
|
||||
rule "12: Redact contact information if applicant is found"
|
||||
// Rule 12: matches a Section whose text contains at least one of the contact labels
// listed in the when-clause, then calls the section's redaction helpers once per label.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like table residue
// from a diff view - confirm against the actual rules file.
// NOTE(review): `section` is used in the consequence but is not bound in the pattern
// shown here; presumably it is bound or declared elsewhere - confirm.
rule "12: Redact contact information"
|
||||
when
|
||||
Section(text.contains("Contact point:")
|
||||
|| text.contains("Phone:")
|
||||
|| text.contains("Fax:")
|
||||
|| text.contains("Tel.:")
|
||||
|| text.contains("Tel:")
|
||||
|| text.contains("E-mail:")
|
||||
|| text.contains("Email:")
|
||||
|| text.contains("e-mail:")
|
||||
|| text.contains("E-mail address:")
|
||||
|| text.contains("Alternative contact:")
|
||||
|| text.contains("Telephone number:")
|
||||
|| text.contains("Telephone No:")
|
||||
|| text.contains("Fax number:")
|
||||
|| text.contains("Telephone:")
|
||||
|| text.contains("European contact:"))
|
||||
then
|
||||
// Every call below passes: the label to search for, the type "PII", the number 12
// (same as in this rule's name), the flag true, a reason text and a legal reference.
// NOTE(review): the exact meaning of the numeric and boolean arguments is defined by
// redactLineAfter/redactBetween, which are not visible here - confirm.
section.redactLineAfter("Contact point:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Phone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Email:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("e-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail address:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
// "Contact:" (capital C) has no check of its own in the when-clause, so this call only
// runs when one of the other labels made the rule match.
section.redactLineAfter("Contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Alternative contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone No:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
// The two redactBetween calls take a start marker and an end marker instead of a single label.
section.redactBetween("No:", "Fax", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("European contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
|
||||
|
||||
// Rule 13: matches a Section when headlineContainsWord(...) is true for "applicant",
// "Primary contact" or "Alternative contact", or when the text contains "Applicant",
// "Contact:" or "Telephone number:". The consequence calls the section's redaction
// helpers with the reason "Applicant information was found".
// NOTE(review): this span holds the same calls twice, first with the number 12 and then
// with 13; presumably these are the pre-change and post-change lines of a diff rendered
// without +/- markers - confirm against the actual rules file before relying on it.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like diff-view residue.
rule "13: Redact contact information if applicant is found"
|
||||
when
|
||||
Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:"))
|
||||
then
|
||||
// Group of calls tagged with the number 12.
section.redactLineAfter("Contact point:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Phone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Email:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("e-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail address:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Alternative contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone No:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
// "Company:" appears only in this 12-tagged group; the 13-tagged group below has no such call.
section.redactLineAfter("Company:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("No:", "Fax", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("European contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
// Group of calls tagged with the number 13 (same as in this rule's name).
section.redactLineAfter("Contact point:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Phone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Email:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("e-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail address:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Alternative contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone No:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("No:", "Fax", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("Contact:", "Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("European contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
|
||||
|
||||
rule "13: Redact contact information if Producer is found"
|
||||
// Rule 14: matches a Section whose lower-cased text contains "producer of the plant protection"
// or "producer of the active substance", or whose text contains "Manufacturer of the active
// substance", "Manufacturer:" or "Producer or producers of the active substance".
// The consequence calls the section's redaction helpers with the reason "Producer was found".
// NOTE(review): this span holds the same calls twice, first with the number 13 and then
// with 14; presumably these are the pre-change and post-change lines of a diff rendered
// without +/- markers - confirm against the actual rules file before relying on it.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like diff-view residue.
rule "14: Redact contact information if Producer is found"
|
||||
when
|
||||
Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance"))
|
||||
then
|
||||
// Group of calls tagged with the number 13. "Contact:" is passed twice within this group.
section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Phone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("No:", "Fax", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
// Group of calls tagged with the number 14 (same as in this rule's name).
// "Contact:" is again passed twice; NOTE(review): the second call looks redundant - confirm.
section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Phone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("E-mail:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Fax number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Telephone number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLineAfter("Tel:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("No:", "Fax", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
|
||||
|
||||
rule "14: Redact AUTHOR(S)"
|
||||
// Rule 15: matches a Section whose searchText contains "AUTHOR(S):" and calls
// redactLinesBetween with the markers "AUTHOR(S):" and "COMPLETION DATE:".
// NOTE(review): the call appears twice, with the numbers 14 and 15; presumably the first
// is the pre-change line of a diff rendered without +/- markers - confirm.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like diff-view residue.
rule "15: Redact AUTHOR(S)"
|
||||
when
|
||||
Section(searchText.contains("AUTHOR(S):"))
|
||||
then
|
||||
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 14, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 15, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
|
||||
|
||||
rule "15: Redact PERFORMING LABORATORY"
|
||||
// Rule 16: matches a Section whose searchText contains "PERFORMING LABORATORY:" and calls
// redactBetween with the markers "PERFORMING LABORATORY:" and "LABORATORY PROJECT ID:".
// NOTE(review): the call appears twice, with the numbers 15 and 16; presumably the first
// is the pre-change line of a diff rendered without +/- markers - confirm.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like diff-view residue.
rule "16: Redact PERFORMING LABORATORY"
|
||||
when
|
||||
Section(searchText.contains("PERFORMING LABORATORY:"))
|
||||
then
|
||||
section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 15, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 16, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
|
||||
|
||||
rule "16: Redact On behalf of Sequani Ltd.:"
|
||||
// Rule 17: matches a Section whose searchText contains "On behalf of Sequani Ltd.: Name Title"
// and calls redactBetween from that marker up to the marker "On behalf of".
// Unlike the contact rules, the boolean argument passed here is false.
// NOTE(review): the call appears twice, with the numbers 16 and 17; presumably the first
// is the pre-change line of a diff rendered without +/- markers - confirm.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like diff-view residue.
rule "17: Redact On behalf of Sequani Ltd.:"
|
||||
when
|
||||
Section(searchText.contains("On behalf of Sequani Ltd.: Name Title"))
|
||||
then
|
||||
section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 16, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
|
||||
|
||||
rule "17: Redact On behalf of Syngenta Ltd.:"
|
||||
// Rule 18: matches a Section whose searchText contains "On behalf of Syngenta Ltd.: Name Title"
// and calls redactBetween from that marker up to the marker "Study dates".
// Unlike the contact rules, the boolean argument passed here is false.
// NOTE(review): the call appears twice, with the numbers 17 and 18; presumably the first
// is the pre-change line of a diff rendered without +/- markers - confirm.
// NOTE(review): the bare "|" and "||||" lines are not DRL; they look like diff-view residue.
rule "18: Redact On behalf of Syngenta Ltd.:"
|
||||
when
|
||||
Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title"))
|
||||
then
|
||||
section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 18, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||
end
|
||||
Loading…
x
Reference in New Issue
Block a user