diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java
index fe931572..b8bc4fd1 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java
@@ -146,6 +146,23 @@ public class EntityCreationService {
                 .map(boundary -> byBoundary(boundary, type, entityType, node));
     }
 
+    /**
+     * Same as {@link #byRegexWithLinebreaks(String, String, EntityType, int, SemanticNode)} with
+     * capture group 0, i.e. the whole regex match becomes the entity boundary.
+     */
+    public Stream byRegexWithLinebreaks(String regexPattern, String type, EntityType entityType, SemanticNode node) {
+
+        return byRegexWithLinebreaks(regexPattern, type, entityType, 0, node);
+    }
+
+    /**
+     * Same as {@link #byRegexWithLinebreaksIgnoreCase(String, String, EntityType, int, SemanticNode)}
+     * with capture group 0, i.e. the whole regex match becomes the entity boundary.
+     */
+    public Stream byRegexWithLinebreaksIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) {
+
+        return byRegexWithLinebreaksIgnoreCase(regexPattern, type, entityType, 0, node);
+    }
 
     public Stream byRegex(String regexPattern, String type, EntityType entityType, SemanticNode node) {
 
@@ -159,6 +176,34 @@ public class EntityCreationService {
     }
 
 
+    /**
+     * Searches the node's text block for the regex via
+     * {@link RedactionSearchUtility#findBoundariesByRegexWithLinebreaks} and creates one entity per
+     * found boundary through {@code byBoundary}.
+     *
+     * @param regexPattern regular expression to search for
+     * @param type         type name handed on to {@code byBoundary}
+     * @param entityType   entity type handed on to {@code byBoundary}
+     * @param group        capture group whose span becomes the entity boundary (0 = whole match)
+     * @param node         semantic node whose text block is searched
+     * @return stream with one {@code byBoundary} result per found boundary
+     */
+    public Stream byRegexWithLinebreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
+
+        return RedactionSearchUtility.findBoundariesByRegexWithLinebreaks(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));
+    }
+
+
+    /**
+     * IgnoreCase variant of
+     * {@link #byRegexWithLinebreaks(String, String, EntityType, int, SemanticNode)}; delegates to
+     * {@link RedactionSearchUtility#findBoundariesByRegexWithLinebreaksIgnoreCase}.
+     */
+    public Stream byRegexWithLinebreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
+
+        return RedactionSearchUtility.findBoundariesByRegexWithLinebreaksIgnoreCase(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));
+    }
+
     public Stream byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
 
         return RedactionSearchUtility.findBoundariesByRegex(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RedactionSearchUtility.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RedactionSearchUtility.java
index f01ce61a..123640e2 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RedactionSearchUtility.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RedactionSearchUtility.java
@@ -102,6 +102,34 @@ public class RedactionSearchUtility {
     }
 
 
+    /**
+     * Finds every match of the regex in a copy of the text block's text in which the last
+     * character of each non-empty atomic text block is replaced by {@code '\n'}.
+     *
+     * @param regexPattern regular expression, compiled via {@code Patterns.getCompiledMultilinePattern}
+     * @param group        capture group whose span is reported (0 = whole match)
+     * @param textBlock    text block to search
+     * @return boundaries of the matches, shifted by the start of the text block's boundary
+     */
+    public static List findBoundariesByRegexWithLinebreaks(String regexPattern, int group, TextBlock textBlock) {
+
+        Pattern pattern = Patterns.getCompiledMultilinePattern(regexPattern, false);
+        return getBoundariesByPatternWithLinebreaks(textBlock, group, pattern);
+    }
+
+
+    /**
+     * Same as {@link #findBoundariesByRegexWithLinebreaks(String, int, TextBlock)}, except that
+     * {@code true} is passed to {@code Patterns.getCompiledMultilinePattern} (presumably the
+     * ignore-case switch; confirm against {@code Patterns}).
+     */
+    public static List findBoundariesByRegexWithLinebreaksIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
+
+        Pattern pattern = Patterns.getCompiledMultilinePattern(regexPattern, true);
+        return getBoundariesByPatternWithLinebreaks(textBlock, group, pattern);
+    }
+
+
     public static List findBoundariesByRegexIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
 
         Pattern pattern = Patterns.getCompiledPattern(regexPattern, true);
@@ -120,6 +148,57 @@ public class RedactionSearchUtility {
     }
 
 
+    /**
+     * Runs the pattern against a copy of the text in which the last character of every non-empty
+     * atomic text block is replaced by {@code '\n'} and collects the span of the requested group
+     * for each match.
+     * <p>
+     * Replacing (rather than inserting) a character keeps the length of the searched string
+     * unchanged; match offsets are then shifted by the start of the text block's boundary.
+     * NOTE(review): this presumes the concatenated atomic texts line up with the text block's own
+     * offsets - confirm against {@code TextBlock}.
+     *
+     * @param textBlock text block to search
+     * @param group     capture group whose span is reported (0 = whole match)
+     * @param pattern   compiled pattern to apply
+     * @return boundaries of all matches in which {@code group} took part
+     */
+    private static List<Boundary> getBoundariesByPatternWithLinebreaks(TextBlock textBlock, int group, Pattern pattern) {
+
+        StringBuilder stringBuilder = new StringBuilder();
+        textBlock.getAtomicTextBlocks().forEach(at -> {
+            int lengthBefore = stringBuilder.length();
+            if (at.numberOfLines() > 1) {
+                // NOTE(review): only the end of the last line becomes '\n'; line ends inside the
+                // atomic block stay exactly as getLine(i) delivers them - confirm this is intended.
+                for (int i = 0; i < at.numberOfLines(); i++) {
+                    stringBuilder.append(at.getLine(i));
+                }
+            } else {
+                stringBuilder.append(at.getSearchText());
+            }
+            // An empty block must not touch the builder: on an empty builder setCharAt(-1) throws,
+            // otherwise it would rewrite the last character of the previous block.
+            if (stringBuilder.length() > lengthBefore) {
+                stringBuilder.setCharAt(stringBuilder.length() - 1, '\n');
+            }
+        });
+
+        Matcher matcher = pattern.matcher(stringBuilder.toString());
+        List<Boundary> boundaries = new LinkedList<>();
+        while (matcher.find()) {
+            int start = matcher.start(group);
+            if (start < 0) {
+                // Matcher.start(int) returns -1 when the group did not take part in the match.
+                continue;
+            }
+            int offset = textBlock.getBoundary().start();
+            boundaries.add(new Boundary(start + offset, matcher.end(group) + offset));
+        }
+        return boundaries;
+    }
+
+
     public static List findBoundariesByString(String searchString, TextBlock textBlock) {
 
         List boundaries = new LinkedList<>();
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java
index 60ea3603..5be9cb6f 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java
@@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
     @Test
     public void titleExtraction() throws IOException {
 
-        AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf");
+        AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVitro.pdf");
         System.out.println("Start Full integration test");
         analyzeService.analyzeDocumentStructure(new
StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); System.out.println("Finished structure analysis"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl index 8ff4e0b3..621fcec0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl @@ -320,29 +320,35 @@ rule "5: Strain" end -rule "7: study title by document structure" - when - $table: Table(isOnPage(1), - (containsString("Final Report") || containsString("SPL")), - numberOfRows == 1, - numberOfCols == 1) - then - - entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> { - entity.setRedactionReason("Study title found"); - entity.setLegalBasis("n-a"); - entity.setRedaction(true); - entity.addMatchedRule("7"); - }); - end +//rule "7: study title by document structure" +// when +// $table: Table(isOnPage(1), +// (containsString("Final Report") || containsString("SPL")), +// numberOfRows == 1, +// numberOfCols == 1) +// then +// +// entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> { +// entity.setRedactionReason("Study title found"); +// entity.setLegalBasis("n-a"); +// entity.setRedaction(true); +// entity.addMatchedRule("7"); +// }); +// end rule "7: study title old" when $section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL"))) then -// TODO -// section.redactByRegExWithNewlines("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", true, 0, "title", 7, "Study title found", "n-a"); + + entityCreationService.byRegexWithLinebreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", 
EntityType.ENTITY, $section) + .forEach(entity -> { + entity.setRedactionReason("Title found"); + entity.setLegalBasis("n-a"); + entity.setRedaction(true); + entity.addMatchedRule("7"); + }); entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> { entity.setRedactionReason("Title found"); @@ -447,7 +453,6 @@ rule "8c: Performing Laboratory (Country & Name) from dict" -// Headline not found because of ocr. rule "9: GLP Study" when $headline: Headline(containsString("GOOD LABORATORY PRACTICE COMPLIANCE") @@ -694,8 +699,8 @@ rule "14: Dosages" FileAttribute(label == "OECD Number", value == "425") $section: Section( ( - getHeadline().containsString("Dosages") - || getHeadline().containsString("Study Design") + anyHeadlineContainsString("Dosages") + || anyHeadlineContainsString("Study Design") ) && !getHeadline().containsString("TABLE") ) @@ -714,7 +719,13 @@ rule "14: Dosages" entity.addMatchedRule("14"); }); -//TODO section.redactByRegExWithNewlines("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])",true, 1, "dosages", 14, "Dosage found", "n-a"); + entityCreationService.byRegexWithLinebreaks("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])", "dosages", EntityType.ENTITY,1, $section) + .forEach(entity -> { + entity.setRedactionReason("Dosage found"); + entity.setLegalBasis("n-a"); + entity.setRedaction(true); + entity.addMatchedRule("14"); +}); end rule "15: Mortality" @@ -751,7 +762,7 @@ rule "18: Weight Behavior Changes" when FileAttribute(label == "OECD Number", value == "402") $section: Section( - getHeadline().containsString("Results") + getHeadline().containsStringIgnoreCase("Results") && ( containsString("body weight") || containsString("body weights") @@ -776,8 +787,8 @@ rule "19: Necropsy 
findings" || getHeadline().containsString("Macroscopic Findings") || getHeadline().containsString("Macroscopic examination") ) - && !getHeadline().containsString("Table") - && !getHeadline().containsString("Appendix") + && !getHeadline().containsStringIgnoreCase("Table") + && !getHeadline().containsStringIgnoreCase("Appendix") ) then var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "necropsy_findings", EntityType.ENTITY, $section); @@ -793,13 +804,13 @@ rule "22: Clinical observations" FileAttribute(label == "OECD Number", value == "403") $section: Section( ( - getHeadline().containsString("Clinical Observations") - || getHeadline().containsString("Clinical observations") - || getHeadline().containsString("In-life Observations") - || getHeadline().containsString("Postmortem Observations") + anyHeadlineContainsStringIgnoreCase("Clinical Observations") + || anyHeadlineContainsStringIgnoreCase("Clinical observations") + || anyHeadlineContainsStringIgnoreCase("In-life Observations") + || anyHeadlineContainsStringIgnoreCase("Postmortem Observations") ) - && !getHeadline().containsString("Appendix") - && !getHeadline().containsString("Table") + && !anyHeadlineContainsStringIgnoreCase("Appendix") + && !anyHeadlineContainsStringIgnoreCase("Table") ) then @@ -872,8 +883,8 @@ rule "23: Bodyweight changes" || getHeadline().containsString("Body Weights") || getHeadline().containsString("Body Weight") ) - && !getHeadline().containsString("Appendix") - && !getHeadline().containsString("TABLE") + && !getHeadline().containsStringIgnoreCase("Appendix") + && !getHeadline().containsStringIgnoreCase("TABLE") && hasParagraphs() ) then @@ -889,7 +900,7 @@ rule "24: Study Design" when FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487")) $section: Section( - getHeadline().containsString("study design") + 
anyHeadlineContainsStringIgnoreCase("study design") ) then var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "study_design", EntityType.ENTITY, $section); @@ -924,7 +935,7 @@ rule "26: Detailing (404 & 405)" when FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405")) $section: Section( - getHeadline().containsString("Results") && !getHeadline().containsString("Evaluation") && !getHeadline().containsString("study") + getHeadline().containsStringIgnoreCase("Results") && !getHeadline().containsStringIgnoreCase("Evaluation") && !getHeadline().containsStringIgnoreCase("study") && hasParagraphs() ) then var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "detailing", EntityType.ENTITY, $section); @@ -991,10 +1002,10 @@ rule "35: Sex" FileAttribute(label == "OECD Number", valueEqualsAnyOf("405","429")) $section: Section( ( - getHeadline().containsString("animal") - || getHeadline().containsString("test system") + getHeadline().containsStringIgnoreCase("animal") + || getHeadline().containsStringIgnoreCase("test system") ) - && !getHeadline().containsString("selection") + && !getHeadline().containsStringIgnoreCase("selection") && ( containsStringIgnoreCase("sex:") || containsStringIgnoreCase("male") diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/15-Curacron_ToxicidadeAgudaOral.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/15-Curacron_ToxicidadeAgudaOral.pdf new file mode 100644 index 00000000..7f3ae7b9 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/15-Curacron_ToxicidadeAgudaOral.pdf differ diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/16-Curacron_ToxicidadeDermica.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/16-Curacron_ToxicidadeDermica.pdf new file mode 100644 index 00000000..ff8bdfda Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/16-Curacron_ToxicidadeDermica.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/17-Curacron_ToxicidadeInalatoria.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/17-Curacron_ToxicidadeInalatoria.pdf new file mode 100644 index 00000000..cd8482a2 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/17-Curacron_ToxicidadeInalatoria.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVitro.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVitro.pdf new file mode 100644 index 00000000..069b5cfe Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVitro.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVivo.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVivo.pdf new file mode 100644 index 00000000..5d6959ca Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVivo.pdf differ diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/19-Curacron_IrritacaoCutaneaAguda.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/19-Curacron_IrritacaoCutaneaAguda.pdf new file mode 100644 index 00000000..abe3e2cf Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/19-Curacron_IrritacaoCutaneaAguda.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/20_Curacron_SensibilizacaoCutanea.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/20_Curacron_SensibilizacaoCutanea.pdf new file mode 100644 index 00000000..dfb63923 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/20_Curacron_SensibilizacaoCutanea.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/21-Curacron_MutacaoGenicaEmCelulasBacterianas.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/21-Curacron_MutacaoGenicaEmCelulasBacterianas.pdf new file mode 100644 index 00000000..2c1a4077 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/21-Curacron_MutacaoGenicaEmCelulasBacterianas.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/22-Curacron_DanoCromossomicoInVitro.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/22-Curacron_DanoCromossomicoInVitro.pdf new file mode 100644 index 00000000..5e8a1956 Binary files /dev/null and 
b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/A8591B/22-Curacron_DanoCromossomicoInVitro.pdf differ