DM-307: Changed rule to applyWithLineBreaks, fixes applyWithLineBreaks #37

Merged
dominique.eiflaender1 merged 1 commit from DM-307 into master 2023-07-06 14:27:15 +02:00
3 changed files with 43 additions and 61 deletions

View File

@ -7,6 +7,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -210,7 +211,7 @@ public class AtomicTextBlock implements TextBlock {
}
CharSequence subSequence = subSequence(boundary);
Set<Integer> lbInBoundary = lineBreaks.stream().filter(boundary::contains).collect(Collectors.toSet());
Set<Integer> lbInBoundary = new HashSet<>(lineBreaks);
if (boundary.end() == getBoundary().end()) {
lbInBoundary.add(getBoundary().length());
}

View File

@ -44,7 +44,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A8591B/15-Curacron_ToxicidadeAgudaOral.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/425_F.1.1.1 - A13617AV - Acute Oral Toxicity Study.pdf");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));

View File

@ -292,6 +292,7 @@ rule "DOC.4.3: Species"
then
$section.getEntitiesOfType("species").forEach(entity -> {
entity.apply("DOC.4.3", "Species found.", "n-a");
entity.setValue(entity.getValue().toLowerCase());
});
end
@ -313,18 +314,7 @@ rule "DOC.5.0: Strain"
entity.apply("DOC.5.0", "Strain found.", "n-a");
});
end
// DOC.6.0: matches three Headline facts in a chain:
//   1. a headline containing "materials and methods" (case-insensitive),
//   2. a headline containing "controls" whose section identifier isChildOf the first one's,
//   3. a headline containing "positive control substances" whose section identifier isChildOf the second one's.
// For the third headline, entities of type "irgendwas" are created from its parent node via
// bySemanticNodeParagraphsOnly, and rule id "DOC.6.0" is applied to each of them.
rule "DOC.6.0"
    when
        Headline(containsStringIgnoreCase("materials and methods"), $materialsId: getSectionIdentifier())
        Headline(containsStringIgnoreCase("controls"), getSectionIdentifier().isChildOf($materialsId), $controlsId: getSectionIdentifier())
        $headline: Headline(containsStringIgnoreCase("positive control substances"), getSectionIdentifier().isChildOf($controlsId))
    then
        // Print the matched headline to stdout before creating entities.
        System.out.println($headline);
        entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "irgendwas", EntityType.ENTITY)
            .forEach(entity -> entity.apply("DOC.6.0", "positive control substance found", "n-a"));
end
//rule "DOC.7.0: study title by document structure"
// when
@ -507,7 +497,7 @@ rule "DOC.11.0: Conclusions - LD50, LC50, Confidence"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","425","436"))
$section: Section(
(getHeadline().containsString("Conclusion") || getHeadline().containsString("Lethality"))
(getHeadline().containsStringIgnoreCase("Conclusion") || anyHeadlineContainsStringIgnoreCase("Lethality"))
&& (containsString("LD") || containsString("LC") || containsString("50") || containsString("LD50") || containsString("lethal concentration") || containsString("lethal dose"))
&& (
containsString("greater than")
@ -566,7 +556,7 @@ rule "DOC.12.1: Guideline Deviation in text"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436","471"))
$section: Section(
getHeadline().containsString("Introduction")
getHeadline().containsStringIgnoreCase("Introduction")
&& containsStringIgnoreCase("deviations from the protocol")
)
then
@ -585,11 +575,10 @@ rule "DOC.13.0: Clinical Signs"
|| getHeadline().containsString("Macroscopic Findings")
)
&& !getHeadline().containsString("TABLE")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_signs", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.13.0", "Clinical Signs found", "n-a"));
end
@ -615,11 +604,11 @@ rule "DOC.14.0: Dosages"
// DOC.15.0: for a Headline fact containing "Mortality" but not "TABLE", together with a
// FileAttribute labelled "OECD Number" whose value is "425", creates "mortality" entities from the
// headline's parent via bySemanticNodeParagraphsOnly and applies rule id "DOC.15.0" to each.
// NOTE(review): this span appears to come from a diff view whose +/- markers were lost; each pair of
// near-identical adjacent lines below looks like the old line followed by its replacement.
// Confirm against the repository before treating this as one compilable rule.
rule "DOC.15.0: Mortality"
when
// presumably the pre-change pattern (additionally requires hasParagraphs()) — TODO confirm
$headline: Headline(containsString("Mortality") && !containsString("TABLE") && hasParagraphs())
// presumably the post-change pattern (hasParagraphs() condition dropped) — TODO confirm
$headline: Headline(containsString("Mortality") && !containsString("TABLE"))
FileAttribute(label == "OECD Number", value == "425")
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality", EntityType.ENTITY)
// presumably the pre-change call (apply) — TODO confirm
.forEach(entity -> entity.apply("DOC.15.0", "Mortality found", "n-a"));
// presumably the post-change call (applyWithLineBreaks) — TODO confirm
.forEach(entity -> entity.applyWithLineBreaks("DOC.15.0", "Mortality found", "n-a"));
end
@ -627,12 +616,11 @@ rule "DOC.17.0: Study Conclusion"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436","471"))
$section: Section(
getHeadline().containsString("Conclusion")
&& hasParagraphs()
getHeadline().containsStringIgnoreCase("Conclusion")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.17.0", "Study Conclusion found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.17.0", "Study Conclusion found", "n-a"));
end
@ -647,11 +635,10 @@ rule "DOC.18.0: Weight Behavior Changes"
|| containsString("bodyweight")
|| containsString("bodyweights")
)
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "weight_behavior_changes", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.18.0", "Weight behavior changes found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.18.0", "Weight behavior changes found", "n-a"));
end
@ -666,11 +653,10 @@ rule "DOC.19.0: Necropsy findings"
)
&& !getHeadline().containsStringIgnoreCase("Table")
&& !getHeadline().containsStringIgnoreCase("Appendix")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "necropsy_findings", EntityType.ENTITY)
.forEach( entity -> entity.apply("DOC.19.0", "Necropsy section found", "n-a"));
.forEach( entity -> entity.applyWithLineBreaks("DOC.19.0", "Necropsy section found", "n-a"));
end
@ -686,11 +672,10 @@ rule "DOC.22.0: Clinical observations"
)
&& !anyHeadlineContainsStringIgnoreCase("Appendix")
&& !anyHeadlineContainsStringIgnoreCase("Table")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_observations", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.22.0", "Clinical observations section found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.22.0", "Clinical observations section found", "n-a"));
end
@ -743,11 +728,10 @@ rule "DOC.23.0: Bodyweight changes"
)
&& !getHeadline().containsStringIgnoreCase("Appendix")
&& !getHeadline().containsStringIgnoreCase("TABLE")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "bodyweight_changes", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.23.0", "Bodyweight section found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.23.0", "Bodyweight section found", "n-a"));
end
@ -756,11 +740,10 @@ rule "DOC.24.0: Study Design"
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
$section: Section(
anyHeadlineContainsStringIgnoreCase("study design")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_design", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.24.0", "Study design section found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.24.0", "Study design section found", "n-a"));
end
@ -778,11 +761,10 @@ rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
&& !getHeadline().containsString("CONCLUSIONS")
&& !getHeadline().containsString("Interpretation")
&& !getHeadline().containsString("Viability")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.25.0", "Results and Conclusion found", "n-a"));
end
@ -795,15 +777,10 @@ rule "DOC.26.0: Detailing (404 & 405)"
&& !getHeadline().containsStringIgnoreCase("Evaluation")
&& !getHeadline().containsStringIgnoreCase("study")
&& !getHeadline().containsStringIgnoreCase("discussion")
&& hasParagraphs()
)
then
var paragraphs = $section.streamAllSubNodesOfType(NodeType.PARAGRAPH).toList();
for(var p : paragraphs){
entityCreationService.bySemanticNode(p, "detailing", EntityType.ENTITY).ifPresent(entity -> {
entity.apply("DOC.26.0", "Detailing found", "n-a");
});
}
entityCreationService.bySemanticNodeParagraphsOnly($section, "detailing", EntityType.ENTITY)
.forEach(entity -> entity.applyWithLineBreaks("DOC.26.0", "Detailing found", "n-a"));
end
@ -813,21 +790,20 @@ rule "DOC.32.0: Preliminary Test Results (429)"
$section: Section(
((getHeadline().containsString("Preliminary Screening Test") && containsString("Clinical observations"))
|| getHeadline().containsString("Pre-Experiment"))
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.32.0", "Preliminary Test Results found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.32.0", "Preliminary Test Results found", "n-a"));
end
// DOC.33.0: when a FileAttribute labelled "OECD Number" has value "429" and a Section's headline
// contains one of "RESULTS AND DISCUSSION", "Estimation of the proliferative response of lymph node
// cells" or "Results in the Main Experiment", creates "test_results" entities from that section via
// bySemanticNodeParagraphsOnly and applies rule id "DOC.33.0" to each.
// NOTE(review): this span appears to come from a diff view whose +/- markers were lost; each pair of
// near-identical adjacent lines below looks like the old line followed by its replacement.
// Confirm against the repository before treating this as one compilable rule.
rule "DOC.33.0: Test Results (429)"
when
FileAttribute(label == "OECD Number", value == "429")
// presumably the pre-change pattern (additionally requires hasParagraphs()) — TODO confirm
$section: Section((getHeadline().containsString("RESULTS AND DISCUSSION") || getHeadline().containsString("Estimation of the proliferative response of lymph node cells") || getHeadline().containsString("Results in the Main Experiment")) && hasParagraphs())
// presumably the post-change pattern (hasParagraphs() condition dropped) — TODO confirm
$section: Section((getHeadline().containsString("RESULTS AND DISCUSSION") || getHeadline().containsString("Estimation of the proliferative response of lymph node cells") || getHeadline().containsString("Results in the Main Experiment")))
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "test_results", EntityType.ENTITY)
// presumably the pre-change call (apply) — TODO confirm
.forEach(entity -> entity.apply("DOC.33.0", "Test Results found", "n-a"));
// presumably the post-change call (applyWithLineBreaks) — TODO confirm
.forEach(entity -> entity.applyWithLineBreaks("DOC.33.0", "Test Results found", "n-a"));
end
@ -959,11 +935,10 @@ rule "DOC.39.0: Dilution of the test substance"
$section: Section(
getHeadline().containsString("Formulation")
&& containsString("dilution")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "dilution", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.39.0", "Dilution found.", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.39.0", "Dilution found.", "n-a"));
end
@ -973,21 +948,20 @@ rule "DOC.40.0: Positive Control"
$section: Section(
getHeadline().containsStringIgnoreCase("Positive Control")
&& !(getHeadline().containsStringIgnoreCase("Appendix") || getHeadline().containsStringIgnoreCase("Table"))
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "positive_control", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.40.0", "Positive control found.", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.40.0", "Positive control found.", "n-a"));
end
// DOC.42.0: when a FileAttribute labelled "OECD Number" has value "402" and a Headline fact contains
// "Mortality" but not "TABLE", creates "mortality_statement" entities from the headline's parent via
// bySemanticNodeParagraphsOnly and applies rule id "DOC.42.0" to each.
// NOTE(review): this span appears to come from a diff view whose +/- markers were lost; each pair of
// near-identical adjacent lines below looks like the old line followed by its replacement.
// Confirm against the repository before treating this as one compilable rule.
rule "DOC.42.0: Mortality Statement"
when
FileAttribute(label == "OECD Number", value == "402")
// presumably the pre-change pattern (additionally requires hasParagraphs()) — TODO confirm
$headline: Headline(containsString("Mortality") && !containsString("TABLE") && hasParagraphs())
// presumably the post-change pattern (hasParagraphs() condition dropped) — TODO confirm
$headline: Headline(containsString("Mortality") && !containsString("TABLE"))
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality_statement", EntityType.ENTITY)
// presumably the pre-change call (apply) — TODO confirm
.forEach(entity -> entity.apply("DOC.42.0", "Mortality Statement found", "n-a"));
// presumably the post-change call (applyWithLineBreaks) — TODO confirm
.forEach(entity -> entity.applyWithLineBreaks("DOC.42.0", "Mortality Statement found", "n-a"));
end
@ -1038,16 +1012,11 @@ rule "DOC.44.0: Results (Main Study)"
$section: Section(
getHeadline().containsString("Results")
&& getHeadline().getBoundary().length() < 20
&& hasParagraphs()
&& !(getHeadline().containsString("Appendix") || getHeadline().containsString("Table"))
)
then
var paragraphs = $section.streamAllSubNodesOfType(NodeType.PARAGRAPH).toList();
for(var p : paragraphs){
entityCreationService.bySemanticNode(p, "results_(main_study)", EntityType.ENTITY).ifPresent(entity -> {
entity.apply("DOC.44.0", "Results for main study found.", "n-a");
});
}
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_(main_study)", EntityType.ENTITY)
.forEach(entity -> entity.applyWithLineBreaks("DOC.44.0", "Results for main study found.", "n-a"));
end
@ -1056,13 +1025,25 @@ rule "DOC.45.0: Doses (mg/kg bodyweight)"
FileAttribute(label == "OECD Number", value == "402")
$section: Section(
anyHeadlineContainsStringIgnoreCase("study design")
&& hasParagraphs()
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "doses_(mg_kg_bw)", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.45.0", "Doses per bodyweight information found", "n-a"));
.forEach(entity -> entity.applyWithLineBreaks("DOC.45.0", "Doses per bodyweight information found", "n-a"));
end
// This is just an example of the new rules feature.
//rule "DOC.99.0"
// when
// Headline(containsStringIgnoreCase("materials and methods"), $sectionIdentifierMaterials: getSectionIdentifier())
// Headline(containsStringIgnoreCase("controls"), getSectionIdentifier().isChildOf($sectionIdentifierMaterials), $sectionIdentifierControls: getSectionIdentifier())
// $headline: Headline(containsStringIgnoreCase("positive control substances"), getSectionIdentifier().isChildOf($sectionIdentifierControls))
// then
// System.out.println($headline);
// entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "irgendwas", EntityType.ENTITY)
// .forEach(entity -> {
// entity.applyWithLineBreaks("DOC.6.0", "positive control substance found", "n-a");
// });
// end
//------------------------------------ Manual redaction rules ------------------------------------