RSS-42: Made redactBetween rule more generic

This commit is contained in:
deiflaender 2022-09-07 14:31:08 +02:00
parent 9867cd6848
commit c29c5eef0d
35 changed files with 60 additions and 3 deletions

View File

@ -998,7 +998,17 @@ public class Section {
private void redactBetween(String start, String stop, String asType, int ruleNumber, boolean redactEverywhere,
String reason, String legalBasis, boolean redaction) {
String[] values = StringUtils.substringsBetween(searchText, start, stop);
String[] values = new String[1];
if(start.isEmpty() && stop.isEmpty()){
values[0] = searchText;
} else if(start.isEmpty() && searchText.contains(stop)){
values[0] = StringUtils.substringBefore(searchText, stop);
} else if (stop.isEmpty() && searchText.contains(start)){
values[0] = StringUtils.substringAfter(searchText, start);
} else {
values = StringUtils.substringsBetween(searchText, start, stop);
}
if (values != null) {
for (String value : values) {

View File

@ -330,6 +330,36 @@ public class RedactionIntegrationTest {
}
/**
 * Runs structure analysis, analysis and annotation on the Emamectin Benzoate sample PDF
 * and writes the annotated document to "Annotated.pdf" in the temporary directory.
 * The test makes no assertions; it only fails if one of the calls throws.
 *
 * @throws IOException if the sample PDF cannot be read or the annotated file cannot be written
 */
@Test
public void titleExtraction() throws IOException {

    ClassPathResource samplePdf = new ClassPathResource("files/RSS/32 - Emamectin Benzoate Technical - Acute Oral Toxicity - Mouse.pdf");
    AnalyzeRequest analyzeRequest = prepareStorage(samplePdf.getInputStream());

    // Structure analysis is triggered before the actual analysis, using the ids of the prepared request.
    StructureAnalyzeRequest structureRequest = new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
    analyzeService.analyzeDocumentStructure(structureRequest);
    AnalyzeResult analyzeResult = analyzeService.analyze(analyzeRequest);

    // Fetched but not checked by this test (NOTE(review): presumably kept for inspection in a debugger).
    var storedRedactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
    var storedText = redactionStorageService.getText(TEST_DOSSIER_ID, TEST_FILE_ID);

    var annotateRequest = AnnotateRequest.builder()
            .dossierId(TEST_DOSSIER_ID)
            .fileId(TEST_FILE_ID)
            .build();
    AnnotateResponse annotated = annotationService.annotate(annotateRequest);

    // Persist the annotated PDF so the result can be looked at manually.
    String annotatedPdfPath = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
    try (FileOutputStream annotatedPdfStream = new FileOutputStream(annotatedPdfPath)) {
        annotatedPdfStream.write(annotated.getDocument());
    }
}
@Test
@Ignore
@SneakyThrows
@ -1064,7 +1094,7 @@ public class RedactionIntegrationTest {
public void classificationTest() throws IOException {
System.out.println("classificationTest");
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/RSS/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());

View File

@ -390,4 +390,21 @@ rule "101: Redact CAS numbers"
Section(hasTableHeader("Sample #"))
then
section.redactCell("Sample #", 8, "PII", true, "Redacted because row is a vertebrate study", "Reg (EC) No 1107/2009 Art. 63 (2g)");
end
end
// Rule 102: marks the document title in sections with sectionNumber <= 3
// whose text does not contain "AUTHOR".
// Each redactBetween call passes a start marker, a stop marker, the type "PII",
// the number 5, false, the reason "Title found" and the legal basis "n-a".
rule "102: Extract title"
    when
        Section(sectionNumber <= 3 && !text.contains("AUTHOR"))
    then
        // Empty start marker: presumably selects the text before the stop marker
        // (verify against Section.redactBetween). Both spellings of the stop marker are tried.
        section.redactBetween("", "Final Report", "PII", 5, false, "Title found", "n-a");
        section.redactBetween("", "FINAL REPORT", "PII", 5, false, "Title found", "n-a");
        // Title enclosed by the "TITLE" and "DATA REQUIREMENT" headings.
        // This call was previously issued twice with identical arguments; the duplicate was removed.
        section.redactBetween("TITLE", "DATA REQUIREMENT", "PII", 5, false, "Title found", "n-a");
end
// Rule 102-1: title extraction for sections with sectionNumber <= 3 whose text contains "SPL".
// The text between "Laboratories" and "SPL" is passed to redactBetween with type "PII",
// the number 5, false, the reason "Title found" and the legal basis "n-a".
rule "102-1: Extract title"
    when
        Section(
            sectionNumber <= 3,
            text.contains("SPL")
        )
    then
        section.redactBetween("Laboratories", "SPL", "PII", 5, false, "Title found", "n-a");
end