DM-307: Implemented rule function byRegexWithLinebreaks #23

Merged
dominique.eiflaender1 merged 1 commit from DM-307-3 into master 2023-06-29 12:03:24 +02:00
13 changed files with 106 additions and 38 deletions

View File

@ -146,6 +146,15 @@ public class EntityCreationService {
.map(boundary -> byBoundary(boundary, type, entityType, node));
}
/**
 * Convenience overload of the line-break-aware regex search that uses capture group 0.
 *
 * @param regexPattern regular expression to search for
 * @param type         type passed on for every created entity
 * @param entityType   entity type passed on for every created entity
 * @param node         semantic node that is searched
 * @return the stream produced by the group-aware overload for group 0
 */
public Stream<RedactionEntity> byRegexWithLinebreaks(String regexPattern, String type, EntityType entityType, SemanticNode node) {

    // Group 0 denotes the entire match in java.util.regex.
    final int wholeMatchGroup = 0;
    return byRegexWithLinebreaks(regexPattern, type, entityType, wholeMatchGroup, node);
}
/**
 * Convenience overload of the line-break-aware, ignore-case regex search that uses capture group 0.
 *
 * @param regexPattern regular expression to search for
 * @param type         type passed on for every created entity
 * @param entityType   entity type passed on for every created entity
 * @param node         semantic node that is searched
 * @return the stream produced by the group-aware overload for group 0
 */
public Stream<RedactionEntity> byRegexWithLinebreaksIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) {

    // Group 0 denotes the entire match in java.util.regex.
    final int wholeMatchGroup = 0;
    return byRegexWithLinebreaksIgnoreCase(regexPattern, type, entityType, wholeMatchGroup, node);
}
public Stream<RedactionEntity> byRegex(String regexPattern, String type, EntityType entityType, SemanticNode node) {
@ -159,6 +168,17 @@ public class EntityCreationService {
}
/**
 * Searches the node's text block with the line-break-aware regex search and turns every
 * resulting boundary into an entity via {@code byBoundary}.
 *
 * @param regexPattern regular expression to search for
 * @param type         type passed to {@code byBoundary} for every match
 * @param entityType   entity type passed to {@code byBoundary} for every match
 * @param group        capture group whose span is used as the boundary
 * @param node         semantic node whose text block is searched
 * @return a lazily mapped stream with one element per found boundary
 */
public Stream<RedactionEntity> byRegexWithLinebreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {

    var matchedBoundaries = RedactionSearchUtility.findBoundariesByRegexWithLinebreaks(regexPattern, group, node.getTextBlock());
    return matchedBoundaries.stream()
            .map(boundary -> byBoundary(boundary, type, entityType, node));
}
/**
 * Searches the node's text block with the line-break-aware, ignore-case regex search and turns
 * every resulting boundary into an entity via {@code byBoundary}.
 *
 * @param regexPattern regular expression to search for
 * @param type         type passed to {@code byBoundary} for every match
 * @param entityType   entity type passed to {@code byBoundary} for every match
 * @param group        capture group whose span is used as the boundary
 * @param node         semantic node whose text block is searched
 * @return a lazily mapped stream with one element per found boundary
 */
public Stream<RedactionEntity> byRegexWithLinebreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {

    var matchedBoundaries = RedactionSearchUtility.findBoundariesByRegexWithLinebreaksIgnoreCase(regexPattern, group, node.getTextBlock());
    return matchedBoundaries.stream()
            .map(boundary -> byBoundary(boundary, type, entityType, node));
}
public Stream<RedactionEntity> byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
return RedactionSearchUtility.findBoundariesByRegex(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));

View File

@ -102,6 +102,20 @@ public class RedactionSearchUtility {
}
/**
 * Finds the boundaries of all matches of {@code regexPattern} in the line-break-aware
 * representation of the text block.
 * <p>
 * The pattern is obtained from {@code Patterns.getCompiledMultilinePattern} with the flag
 * {@code false}; the ignore-case sibling passes {@code true}, so this variant is presumably
 * case-sensitive.
 *
 * @param regexPattern regular expression to search for
 * @param group        capture group whose span becomes the boundary
 * @param textBlock    text block to search
 * @return the boundaries found by {@code getBoundariesByPatternWithLinebreaks}
 */
public static List<Boundary> findBoundariesByRegexWithLinebreaks(String regexPattern, int group, TextBlock textBlock) {

    return getBoundariesByPatternWithLinebreaks(textBlock, group, Patterns.getCompiledMultilinePattern(regexPattern, false));
}
/**
 * Finds the boundaries of all matches of {@code regexPattern} in the line-break-aware
 * representation of the text block.
 * <p>
 * The pattern is obtained from {@code Patterns.getCompiledMultilinePattern} with the flag
 * {@code true}, which by the method name requests case-insensitive matching.
 *
 * @param regexPattern regular expression to search for
 * @param group        capture group whose span becomes the boundary
 * @param textBlock    text block to search
 * @return the boundaries found by {@code getBoundariesByPatternWithLinebreaks}
 */
public static List<Boundary> findBoundariesByRegexWithLinebreaksIgnoreCase(String regexPattern, int group, TextBlock textBlock) {

    return getBoundariesByPatternWithLinebreaks(textBlock, group, Patterns.getCompiledMultilinePattern(regexPattern, true));
}
public static List<Boundary> findBoundariesByRegexIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
Pattern pattern = Patterns.getCompiledPattern(regexPattern, true);
@ -120,6 +134,29 @@ public class RedactionSearchUtility {
}
/**
 * Builds a search string from the atomic text blocks of {@code textBlock} in which the last
 * character contributed by each atomic block is replaced by {@code '\n'}, runs {@code pattern}
 * over it and returns one boundary per match.
 * <p>
 * The line break replaces a character instead of being inserted, so the search string keeps the
 * length of the appended text. Match offsets are shifted by {@code textBlock.getBoundary().start()}.
 *
 * @param textBlock text block whose atomic text blocks are searched
 * @param group     capture group whose span becomes the boundary
 * @param pattern   compiled pattern to apply
 * @return boundaries of all matches in which {@code group} participated, in match order
 */
private static List<Boundary> getBoundariesByPatternWithLinebreaks(TextBlock textBlock, int group, Pattern pattern) {

    StringBuilder searchText = new StringBuilder();
    textBlock.getAtomicTextBlocks().forEach(at -> {
        int lengthBefore = searchText.length();
        if (at.numberOfLines() > 1) {
            // NOTE(review): lines are concatenated without a break between them; only the end of
            // the atomic block is marked. Confirm that a per-line '\n' is not intended here.
            for (int i = 0; i < at.numberOfLines(); i++) {
                searchText.append(at.getLine(i));
            }
        } else {
            searchText.append(at.getSearchText());
        }
        // Only mark the block end if this block contributed text. On a still-empty builder
        // setCharAt(-1, ...) would throw StringIndexOutOfBoundsException; on a non-empty builder
        // the last character is already the '\n' of the previous block.
        if (searchText.length() > lengthBefore) {
            searchText.setCharAt(searchText.length() - 1, '\n');
        }
    });

    Matcher matcher = pattern.matcher(searchText.toString());
    List<Boundary> boundaries = new LinkedList<>();
    while (matcher.find()) {
        int groupStart = matcher.start(group);
        if (groupStart < 0) {
            // The requested group did not participate in this match (Matcher reports -1);
            // skip it instead of creating a boundary at offset - 1.
            continue;
        }
        boundaries.add(new Boundary(groupStart + textBlock.getBoundary().start(), matcher.end(group) + textBlock.getBoundary().start()));
    }
    return boundaries;
}
public static List<Boundary> findBoundariesByString(String searchString, TextBlock textBlock) {
List<Boundary> boundaries = new LinkedList<>();

View File

@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVitro.pdf");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
System.out.println("Finished structure analysis");

View File

@ -320,29 +320,35 @@ rule "5: Strain"
end
// Rule 7: creates a "title" entity from the structure of a single-cell table on page 1.
rule "7: study title by document structure"
when
// Matches a table on page 1 with exactly one row and one column that contains
// "Final Report" or "SPL".
$table: Table(isOnPage(1),
(containsString("Final Report") || containsString("SPL")),
numberOfRows == 1,
numberOfCols == 1)
then
// The second child (index 1) of the table's only cell is used as the title node.
// NOTE(review): get(1) throws IndexOutOfBoundsException when the cell has fewer than two
// children - confirm this cannot happen for matching tables.
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
entity.setRedactionReason("Study title found");
entity.setLegalBasis("n-a");
entity.setRedaction(true);
entity.addMatchedRule("7");
});
end
//rule "7: study title by document structure"
// when
// $table: Table(isOnPage(1),
// (containsString("Final Report") || containsString("SPL")),
// numberOfRows == 1,
// numberOfCols == 1)
// then
//
// entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
// entity.setRedactionReason("Study title found");
// entity.setLegalBasis("n-a");
// entity.setRedaction(true);
// entity.addMatchedRule("7");
// });
// end
rule "7: study title old"
when
$section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL")))
then
// TODO
// section.redactByRegExWithNewlines("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", true, 0, "title", 7, "Study title found", "n-a");
entityCreationService.byRegexWithLinebreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section)
.forEach(entity -> {
entity.setRedactionReason("Title found");
entity.setLegalBasis("n-a");
entity.setRedaction(true);
entity.addMatchedRule("7");
});
entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
entity.setRedactionReason("Title found");
@ -447,7 +453,6 @@ rule "8c: Performing Laboratory (Country & Name) from dict"
// Headline not found because of ocr.
rule "9: GLP Study"
when
$headline: Headline(containsString("GOOD LABORATORY PRACTICE COMPLIANCE")
@ -694,8 +699,8 @@ rule "14: Dosages"
FileAttribute(label == "OECD Number", value == "425")
$section: Section(
(
getHeadline().containsString("Dosages")
|| getHeadline().containsString("Study Design")
anyHeadlineContainsString("Dosages")
|| anyHeadlineContainsString("Study Design")
)
&& !getHeadline().containsString("TABLE")
)
@ -714,7 +719,13 @@ rule "14: Dosages"
entity.addMatchedRule("14");
});
//TODO section.redactByRegExWithNewlines("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])",true, 1, "dosages", 14, "Dosage found", "n-a");
entityCreationService.byRegexWithLinebreaks("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])", "dosages", EntityType.ENTITY,1, $section)
.forEach(entity -> {
entity.setRedactionReason("Dosage found");
entity.setLegalBasis("n-a");
entity.setRedaction(true);
entity.addMatchedRule("14");
});
end
rule "15: Mortality"
@ -751,7 +762,7 @@ rule "18: Weight Behavior Changes"
when
FileAttribute(label == "OECD Number", value == "402")
$section: Section(
getHeadline().containsString("Results")
getHeadline().containsStringIgnoreCase("Results")
&& (
containsString("body weight")
|| containsString("body weights")
@ -776,8 +787,8 @@ rule "19: Necropsy findings"
|| getHeadline().containsString("Macroscopic Findings")
|| getHeadline().containsString("Macroscopic examination")
)
&& !getHeadline().containsString("Table")
&& !getHeadline().containsString("Appendix")
&& !getHeadline().containsStringIgnoreCase("Table")
&& !getHeadline().containsStringIgnoreCase("Appendix")
)
then
var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "necropsy_findings", EntityType.ENTITY, $section);
@ -793,13 +804,13 @@ rule "22: Clinical observations"
FileAttribute(label == "OECD Number", value == "403")
$section: Section(
(
getHeadline().containsString("Clinical Observations")
|| getHeadline().containsString("Clinical observations")
|| getHeadline().containsString("In-life Observations")
|| getHeadline().containsString("Postmortem Observations")
anyHeadlineContainsStringIgnoreCase("Clinical Observations")
|| anyHeadlineContainsStringIgnoreCase("Clinical observations")
|| anyHeadlineContainsStringIgnoreCase("In-life Observations")
|| anyHeadlineContainsStringIgnoreCase("Postmortem Observations")
)
&& !getHeadline().containsString("Appendix")
&& !getHeadline().containsString("Table")
&& !anyHeadlineContainsStringIgnoreCase("Appendix")
&& !anyHeadlineContainsStringIgnoreCase("Table")
)
then
@ -872,8 +883,8 @@ rule "23: Bodyweight changes"
|| getHeadline().containsString("Body Weights")
|| getHeadline().containsString("Body Weight")
)
&& !getHeadline().containsString("Appendix")
&& !getHeadline().containsString("TABLE")
&& !getHeadline().containsStringIgnoreCase("Appendix")
&& !getHeadline().containsStringIgnoreCase("TABLE")
&& hasParagraphs()
)
then
@ -889,7 +900,7 @@ rule "24: Study Design"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
$section: Section(
getHeadline().containsString("study design")
anyHeadlineContainsStringIgnoreCase("study design")
)
then
var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "study_design", EntityType.ENTITY, $section);
@ -924,7 +935,7 @@ rule "26: Detailing (404 & 405)"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405"))
$section: Section(
getHeadline().containsString("Results") && !getHeadline().containsString("Evaluation") && !getHeadline().containsString("study")
getHeadline().containsStringIgnoreCase("Results") && !getHeadline().containsStringIgnoreCase("Evaluation") && !getHeadline().containsStringIgnoreCase("study") && hasParagraphs()
)
then
var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "detailing", EntityType.ENTITY, $section);
@ -991,10 +1002,10 @@ rule "35: Sex"
FileAttribute(label == "OECD Number", valueEqualsAnyOf("405","429"))
$section: Section(
(
getHeadline().containsString("animal")
|| getHeadline().containsString("test system")
getHeadline().containsStringIgnoreCase("animal")
|| getHeadline().containsStringIgnoreCase("test system")
)
&& !getHeadline().containsString("selection")
&& !getHeadline().containsStringIgnoreCase("selection")
&& (
containsStringIgnoreCase("sex:")
|| containsStringIgnoreCase("male")