RED-740: Improved section recognition
This commit is contained in:
parent
0f0f6b3a2e
commit
4ef6e0e2ef
@ -72,7 +72,7 @@ public class ClassificationService {
|
|||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
}else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
textBlock.setClassification("H " + (headlineFontSizes.size() + 1));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -333,7 +333,7 @@ public class RedactionIntegrationTest {
|
|||||||
|
|
||||||
System.out.println("redactionTest");
|
System.out.println("redactionTest");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Primicarb/74 Pirimicarb_RAR_01_Volume_1_2017-12-04.pdf");
|
||||||
|
|
||||||
RedactionRequest request = RedactionRequest.builder()
|
RedactionRequest request = RedactionRequest.builder()
|
||||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||||
@ -431,7 +431,7 @@ public class RedactionIntegrationTest {
|
|||||||
public void classificationTest() throws IOException {
|
public void classificationTest() throws IOException {
|
||||||
|
|
||||||
System.out.println("classificationTest");
|
System.out.println("classificationTest");
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf");
|
||||||
|
|
||||||
RedactionRequest request = RedactionRequest.builder()
|
RedactionRequest request = RedactionRequest.builder()
|
||||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||||
|
|||||||
@ -124,76 +124,114 @@ rule "11: Redacted PII Personal Identification Information"
|
|||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
rule "12: Redact contact information if applicant is found"
|
rule "12: Redact contact information"
|
||||||
|
when
|
||||||
|
Section(text.contains("Contact point:")
|
||||||
|
|| text.contains("Phone:")
|
||||||
|
|| text.contains("Fax:")
|
||||||
|
|| text.contains("Tel.:")
|
||||||
|
|| text.contains("Tel:")
|
||||||
|
|| text.contains("E-mail:")
|
||||||
|
|| text.contains("Email:")
|
||||||
|
|| text.contains("e-mail:")
|
||||||
|
|| text.contains("E-mail address:")
|
||||||
|
|| text.contains("Alternative contact:")
|
||||||
|
|| text.contains("Telephone number:")
|
||||||
|
|| text.contains("Telephone No:")
|
||||||
|
|| text.contains("Fax number:")
|
||||||
|
|| text.contains("Telephone:")
|
||||||
|
|| text.contains("European contact:"))
|
||||||
|
then
|
||||||
|
section.redactLineAfter("Contact point:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Phone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Fax:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Tel:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("E-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Email:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("e-mail:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("E-mail address:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Alternative contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Telephone number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Telephone No:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Fax number:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("Telephone:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactBetween("No:", "Fax", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
section.redactLineAfter("European contact:", "PII", 12, true, "Contact information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
rule "13: Redact contact information if applicant is found"
|
||||||
when
|
when
|
||||||
Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:"))
|
Section(headlineContainsWord("applicant") || text.contains("Applicant") || headlineContainsWord("Primary contact") || headlineContainsWord("Alternative contact") || text.contains("Contact:") || text.contains("Telephone number:"))
|
||||||
then
|
then
|
||||||
section.redactLineAfter("Contact point:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Contact point:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Phone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Phone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Fax:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Fax:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Tel:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Tel:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("E-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("E-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Email:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Email:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("e-mail:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("e-mail:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("E-mail address:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("E-mail address:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Alternative contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Alternative contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Telephone number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Telephone number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Telephone No:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Telephone No:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Fax number:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Fax number:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Telephone:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Telephone:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Company:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactBetween("No:", "Fax", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactBetween("No:", "Fax", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactBetween("Contact:", "Tel.:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactBetween("Contact:", "Tel.:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("European contact:", "PII", 13, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("European contact:", "PII", 12, true, "Applicant information was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
rule "13: Redact contact information if Producer is found"
|
rule "14: Redact contact information if Producer is found"
|
||||||
when
|
when
|
||||||
Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance"))
|
Section(text.toLowerCase().contains("producer of the plant protection") || text.toLowerCase().contains("producer of the active substance") || text.contains("Manufacturer of the active substance") || text.contains("Manufacturer:") || text.contains("Producer or producers of the active substance"))
|
||||||
then
|
then
|
||||||
section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Telephone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Telephone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Phone:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Phone:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Fax:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Fax:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("E-mail:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("E-mail:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Contact:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Contact:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Fax number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Fax number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Telephone number:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Telephone number:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactLineAfter("Tel:", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLineAfter("Tel:", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
section.redactBetween("No:", "Fax", "PII", 13, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactBetween("No:", "Fax", "PII", 14, true, "Producer was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
rule "14: Redact AUTHOR(S)"
|
rule "15: Redact AUTHOR(S)"
|
||||||
when
|
when
|
||||||
Section(searchText.contains("AUTHOR(S):"))
|
Section(searchText.contains("AUTHOR(S):"))
|
||||||
then
|
then
|
||||||
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 14, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactLinesBetween("AUTHOR(S):", "COMPLETION DATE:", "PII", 15, true, "AUTHOR(S) was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
rule "15: Redact PERFORMING LABORATORY"
|
rule "16: Redact PERFORMING LABORATORY"
|
||||||
when
|
when
|
||||||
Section(searchText.contains("PERFORMING LABORATORY:"))
|
Section(searchText.contains("PERFORMING LABORATORY:"))
|
||||||
then
|
then
|
||||||
section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 15, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactBetween("PERFORMING LABORATORY:", "LABORATORY PROJECT ID:", "PII", 16, true, "PERFORMING LABORATORY was found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
rule "16: Redact On behalf of Sequani Ltd.:"
|
rule "17: Redact On behalf of Sequani Ltd.:"
|
||||||
when
|
when
|
||||||
Section(searchText.contains("On behalf of Sequani Ltd.: Name Title"))
|
Section(searchText.contains("On behalf of Sequani Ltd.: Name Title"))
|
||||||
then
|
then
|
||||||
section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 16, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactBetween("On behalf of Sequani Ltd.: Name Title", "On behalf of", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
rule "17: Redact On behalf of Syngenta Ltd.:"
|
rule "18: Redact On behalf of Syngenta Ltd.:"
|
||||||
when
|
when
|
||||||
Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title"))
|
Section(searchText.contains("On behalf of Syngenta Ltd.: Name Title"))
|
||||||
then
|
then
|
||||||
section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 17, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
section.redactBetween("On behalf of Syngenta Ltd.: Name Title", "Study dates", "PII", 18, false , "PII (Personal Identification Information) found", "Reg (EC) No 1107/2009 Art. 63 (2e)");
|
||||||
end
|
end
|
||||||
Loading…
x
Reference in New Issue
Block a user