diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/AtomicTextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/AtomicTextBlock.java index b5ebdead..3504f77c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/AtomicTextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/AtomicTextBlock.java @@ -7,6 +7,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -210,7 +211,7 @@ public class AtomicTextBlock implements TextBlock { } CharSequence subSequence = subSequence(boundary); - Set lbInBoundary = lineBreaks.stream().filter(boundary::contains).collect(Collectors.toSet()); + Set lbInBoundary = new HashSet<>(lineBreaks); if (boundary.end() == getBoundary().end()) { lbInBoundary.add(getBoundary().length()); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java index 226f7d8b..b43e83f5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java @@ -44,7 +44,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest { @Test public void titleExtraction() 
throws IOException { - AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A8591B/15-Curacron_ToxicidadeAgudaOral.pdf"); + AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/425_F.1.1.1 - A13617AV - Acute Oral Toxicity Study.pdf"); System.out.println("Start Full integration test"); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl index 342152fc..7270830c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl @@ -292,6 +292,7 @@ rule "DOC.4.3: Species" then $section.getEntitiesOfType("species").forEach(entity -> { entity.apply("DOC.4.3", "Species found.", "n-a"); + entity.setValue(entity.getValue().toLowerCase()); }); end @@ -313,18 +314,7 @@ rule "DOC.5.0: Strain" entity.apply("DOC.5.0", "Strain found.", "n-a"); }); end -rule "DOC.6.0" - when - Headline(containsStringIgnoreCase("materials and methods"), $sectionIdentifierMaterials: getSectionIdentifier()) - Headline(containsStringIgnoreCase("controls"), getSectionIdentifier().isChildOf($sectionIdentifierMaterials), $sectionIdentifierControls: getSectionIdentifier()) - $headline: Headline(containsStringIgnoreCase("positive control substances"), getSectionIdentifier().isChildOf($sectionIdentifierControls)) - then - System.out.println($headline); - entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "irgendwas", EntityType.ENTITY) - .forEach(entity -> { - entity.apply("DOC.6.0", "positive control substance found", "n-a"); - }); - end + //rule "DOC.7.0: study title by document structure" // when @@ -507,7 +497,7 @@ rule "DOC.11.0: Conclusions - 
LD50, LC50, Confidence" when FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","425","436")) $section: Section( - (getHeadline().containsString("Conclusion") || getHeadline().containsString("Lethality")) + (getHeadline().containsStringIgnoreCase("Conclusion") || anyHeadlineContainsStringIgnoreCase("Lethality")) && (containsString("LD") || containsString("LC") || containsString("50") || containsString("LD50") || containsString("lethal concentration") || containsString("lethal dose")) && ( containsString("greater than") @@ -566,7 +556,7 @@ rule "DOC.12.1: Guideline Deviation in text" when FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436","471")) $section: Section( - getHeadline().containsString("Introduction") + getHeadline().containsStringIgnoreCase("Introduction") && containsStringIgnoreCase("deviations from the protocol") ) then @@ -585,11 +575,10 @@ rule "DOC.13.0: Clinical Signs" || getHeadline().containsString("Macroscopic Findings") ) && !getHeadline().containsString("TABLE") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_signs", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.13.0", "Clinical Signs found", "n-a")); end @@ -615,11 +604,11 @@ rule "DOC.14.0: Dosages" rule "DOC.15.0: Mortality" when - $headline: Headline(containsString("Mortality") && !containsString("TABLE") && hasParagraphs()) + $headline: Headline(containsString("Mortality") && !containsString("TABLE")) FileAttribute(label == "OECD Number", value == "425") then entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.15.0", "Mortality found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.15.0", "Mortality found", "n-a")); end @@ -627,12 +616,11 @@ rule "DOC.17.0: Study 
Conclusion" when FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436","471")) $section: Section( - getHeadline().containsString("Conclusion") - && hasParagraphs() + getHeadline().containsStringIgnoreCase("Conclusion") ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "study_conclusion", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.17.0", "Study Conclusion found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.17.0", "Study Conclusion found", "n-a")); end @@ -647,11 +635,10 @@ rule "DOC.18.0: Weight Behavior Changes" || containsString("bodyweight") || containsString("bodyweights") ) - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "weight_behavior_changes", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.18.0", "Weight behavior changes found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.18.0", "Weight behavior changes found", "n-a")); end @@ -666,11 +653,10 @@ rule "DOC.19.0: Necropsy findings" ) && !getHeadline().containsStringIgnoreCase("Table") && !getHeadline().containsStringIgnoreCase("Appendix") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "necropsy_findings", EntityType.ENTITY) - .forEach( entity -> entity.apply("DOC.19.0", "Necropsy section found", "n-a")); + .forEach( entity -> entity.applyWithLineBreaks("DOC.19.0", "Necropsy section found", "n-a")); end @@ -686,11 +672,10 @@ rule "DOC.22.0: Clinical observations" ) && !anyHeadlineContainsStringIgnoreCase("Appendix") && !anyHeadlineContainsStringIgnoreCase("Table") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_observations", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.22.0", "Clinical observations section found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.22.0", "Clinical observations section found", "n-a")); end @@ -743,11 
+728,10 @@ rule "DOC.23.0: Bodyweight changes" ) && !getHeadline().containsStringIgnoreCase("Appendix") && !getHeadline().containsStringIgnoreCase("TABLE") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "bodyweight_changes", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.23.0", "Bodyweight section found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.23.0", "Bodyweight section found", "n-a")); end @@ -756,11 +740,10 @@ rule "DOC.24.0: Study Design" FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487")) $section: Section( anyHeadlineContainsStringIgnoreCase("study design") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "study_design", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.24.0", "Study design section found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.24.0", "Study design section found", "n-a")); end @@ -778,11 +761,10 @@ rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)" && !getHeadline().containsString("CONCLUSIONS") && !getHeadline().containsString("Interpretation") && !getHeadline().containsString("Viability") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.25.0", "Results and Conclusion found", "n-a")); end @@ -795,15 +777,10 @@ rule "DOC.26.0: Detailing (404 & 405)" && !getHeadline().containsStringIgnoreCase("Evaluation") && !getHeadline().containsStringIgnoreCase("study") && !getHeadline().containsStringIgnoreCase("discussion") - && hasParagraphs() ) then - var paragraphs = $section.streamAllSubNodesOfType(NodeType.PARAGRAPH).toList(); - for(var p : paragraphs){ - entityCreationService.bySemanticNode(p, 
"detailing", EntityType.ENTITY).ifPresent(entity -> { - entity.apply("DOC.26.0", "Detailing found", "n-a"); - }); - } + entityCreationService.bySemanticNodeParagraphsOnly($section, "detailing", EntityType.ENTITY) + .forEach(entity -> entity.applyWithLineBreaks("DOC.26.0", "Detailing found", "n-a")); end @@ -813,21 +790,20 @@ rule "DOC.32.0: Preliminary Test Results (429)" $section: Section( ((getHeadline().containsString("Preliminary Screening Test") && containsString("Clinical observations")) || getHeadline().containsString("Pre-Experiment")) - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.32.0", "Preliminary Test Results found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.32.0", "Preliminary Test Results found", "n-a")); end rule "DOC.33.0: Test Results (429)" when FileAttribute(label == "OECD Number", value == "429") - $section: Section((getHeadline().containsString("RESULTS AND DISCUSSION") || getHeadline().containsString("Estimation of the proliferative response of lymph node cells") || getHeadline().containsString("Results in the Main Experiment")) && hasParagraphs()) + $section: Section((getHeadline().containsString("RESULTS AND DISCUSSION") || getHeadline().containsString("Estimation of the proliferative response of lymph node cells") || getHeadline().containsString("Results in the Main Experiment"))) then entityCreationService.bySemanticNodeParagraphsOnly($section, "test_results", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.33.0", "Test Results found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.33.0", "Test Results found", "n-a")); end @@ -959,11 +935,10 @@ rule "DOC.39.0: Dilution of the test substance" $section: Section( getHeadline().containsString("Formulation") && containsString("dilution") - && hasParagraphs() ) then 
entityCreationService.bySemanticNodeParagraphsOnly($section, "dilution", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.39.0", "Dilution found.", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.39.0", "Dilution found.", "n-a")); end @@ -973,21 +948,20 @@ rule "DOC.40.0: Positive Control" $section: Section( getHeadline().containsStringIgnoreCase("Positive Control") && !(getHeadline().containsStringIgnoreCase("Appendix") || getHeadline().containsStringIgnoreCase("Table")) - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "positive_control", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.40.0", "Positive control found.", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.40.0", "Positive control found.", "n-a")); end rule "DOC.42.0: Mortality Statement" when FileAttribute(label == "OECD Number", value == "402") - $headline: Headline(containsString("Mortality") && !containsString("TABLE") && hasParagraphs()) + $headline: Headline(containsString("Mortality") && !containsString("TABLE")) then entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality_statement", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.42.0", "Mortality Statement found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.42.0", "Mortality Statement found", "n-a")); end @@ -1038,16 +1012,11 @@ rule "DOC.44.0: Results (Main Study)" $section: Section( getHeadline().containsString("Results") && getHeadline().getBoundary().length() < 20 - && hasParagraphs() && !(getHeadline().containsString("Appendix") || getHeadline().containsString("Table")) ) then - var paragraphs = $section.streamAllSubNodesOfType(NodeType.PARAGRAPH).toList(); - for(var p : paragraphs){ - entityCreationService.bySemanticNode(p, "results_(main_study)", EntityType.ENTITY).ifPresent(entity -> { - entity.apply("DOC.44.0", "Results for main study found.", "n-a"); - }); - } + 
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_(main_study)", EntityType.ENTITY) + .forEach(entity -> entity.applyWithLineBreaks("DOC.44.0", "Results for main study found.", "n-a")); end @@ -1056,13 +1025,25 @@ rule "DOC.45.0: Doses (mg/kg bodyweight)" FileAttribute(label == "OECD Number", value == "402") $section: Section( anyHeadlineContainsStringIgnoreCase("study design") - && hasParagraphs() ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "doses_(mg_kg_bw)", EntityType.ENTITY) - .forEach(entity -> entity.apply("DOC.45.0", "Doses per bodyweight information found", "n-a")); + .forEach(entity -> entity.applyWithLineBreaks("DOC.45.0", "Doses per bodyweight information found", "n-a")); end +// This is just an example of the new rules feature. +//rule "DOC.99.0" +// when +// Headline(containsStringIgnoreCase("materials and methods"), $sectionIdentifierMaterials: getSectionIdentifier()) +// Headline(containsStringIgnoreCase("controls"), getSectionIdentifier().isChildOf($sectionIdentifierMaterials), $sectionIdentifierControls: getSectionIdentifier()) +// $headline: Headline(containsStringIgnoreCase("positive control substances"), getSectionIdentifier().isChildOf($sectionIdentifierControls)) +// then +// System.out.println($headline); +// entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "irgendwas", EntityType.ENTITY) +// .forEach(entity -> { +// entity.applyWithLineBreaks("DOC.6.0", "positive control substance found", "n-a"); +// }); +// end //------------------------------------ Manual redaction rules ------------------------------------