DM-307: Implemented rule function byRegexWithLinebreaks #23
@ -146,6 +146,15 @@ public class EntityCreationService {
|
||||
.map(boundary -> byBoundary(boundary, type, entityType, node));
|
||||
}
|
||||
|
||||
public Stream<RedactionEntity> byRegexWithLinebreaks(String regexPattern, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
return byRegexWithLinebreaks(regexPattern, type, entityType, 0, node);
|
||||
}
|
||||
|
||||
public Stream<RedactionEntity> byRegexWithLinebreaksIgnoreCase(String regexPattern, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
return byRegexWithLinebreaksIgnoreCase(regexPattern, type, entityType, 0, node);
|
||||
}
|
||||
|
||||
public Stream<RedactionEntity> byRegex(String regexPattern, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
@ -159,6 +168,17 @@ public class EntityCreationService {
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> byRegexWithLinebreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
|
||||
|
||||
return RedactionSearchUtility.findBoundariesByRegexWithLinebreaks(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> byRegexWithLinebreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
|
||||
|
||||
return RedactionSearchUtility.findBoundariesByRegexWithLinebreaksIgnoreCase(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));
|
||||
}
|
||||
|
||||
public Stream<RedactionEntity> byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node) {
|
||||
|
||||
return RedactionSearchUtility.findBoundariesByRegex(regexPattern, group, node.getTextBlock()).stream().map(boundary -> byBoundary(boundary, type, entityType, node));
|
||||
|
||||
@ -102,6 +102,20 @@ public class RedactionSearchUtility {
|
||||
}
|
||||
|
||||
|
||||
public static List<Boundary> findBoundariesByRegexWithLinebreaks(String regexPattern, int group, TextBlock textBlock) {
|
||||
|
||||
Pattern pattern = Patterns.getCompiledMultilinePattern(regexPattern, false);
|
||||
return getBoundariesByPatternWithLinebreaks(textBlock, group, pattern);
|
||||
}
|
||||
|
||||
|
||||
public static List<Boundary> findBoundariesByRegexWithLinebreaksIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
|
||||
|
||||
Pattern pattern = Patterns.getCompiledMultilinePattern(regexPattern, true);
|
||||
return getBoundariesByPatternWithLinebreaks(textBlock, group, pattern);
|
||||
}
|
||||
|
||||
|
||||
public static List<Boundary> findBoundariesByRegexIgnoreCase(String regexPattern, int group, TextBlock textBlock) {
|
||||
|
||||
Pattern pattern = Patterns.getCompiledPattern(regexPattern, true);
|
||||
@ -120,6 +134,29 @@ public class RedactionSearchUtility {
|
||||
}
|
||||
|
||||
|
||||
private static List<Boundary> getBoundariesByPatternWithLinebreaks(TextBlock textBlock, int group, Pattern pattern) {
|
||||
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
textBlock.getAtomicTextBlocks().forEach(at -> {
|
||||
if (at.numberOfLines() > 1) {
|
||||
for (int i = 0; i < at.numberOfLines(); i++) {
|
||||
stringBuilder.append(at.getLine(i));
|
||||
}
|
||||
stringBuilder.setCharAt(stringBuilder.length() - 1, '\n');
|
||||
} else {
|
||||
stringBuilder.append(at.getSearchText()).setCharAt(stringBuilder.length() - 1, '\n');
|
||||
}
|
||||
});
|
||||
|
||||
Matcher matcher = pattern.matcher(stringBuilder.toString());
|
||||
List<Boundary> boundaries = new LinkedList<>();
|
||||
while (matcher.find()) {
|
||||
boundaries.add(new Boundary(matcher.start(group) + textBlock.getBoundary().start(), matcher.end(group) + textBlock.getBoundary().start()));
|
||||
}
|
||||
return boundaries;
|
||||
}
|
||||
|
||||
|
||||
public static List<Boundary> findBoundariesByString(String searchString, TextBlock textBlock) {
|
||||
|
||||
List<Boundary> boundaries = new LinkedList<>();
|
||||
|
||||
@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
||||
@Test
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A8591B/18-Curacron_ToxicidadeOcularInVitro.pdf");
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
System.out.println("Finished structure analysis");
|
||||
|
||||
@ -320,29 +320,35 @@ rule "5: Strain"
|
||||
end
|
||||
|
||||
|
||||
// Marks the second child of the only cell of a 1x1 table on page 1 as the study title,
// provided the table contains "Final Report" or "SPL".
rule "7: study title by document structure"
    when
        $table: Table(isOnPage(1),
                      (containsString("Final Report") || containsString("SPL")),
                      numberOfRows == 1,
                      numberOfCols == 1)
    then
        // The entity is created from the second child of the cell. Skip the table when the cell has
        // fewer than two children instead of failing with an IndexOutOfBoundsException on get(1).
        var cellChildren = $table.getCell(0, 0).streamChildren().toList();
        if (cellChildren.size() > 1) {
            entityCreationService.bySemanticNode(cellChildren.get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
                entity.setRedactionReason("Study title found");
                entity.setLegalBasis("n-a");
                entity.setRedaction(true);
                entity.addMatchedRule("7");
            });
        }
    end
|
||||
//rule "7: study title by document structure"
|
||||
// when
|
||||
// $table: Table(isOnPage(1),
|
||||
// (containsString("Final Report") || containsString("SPL")),
|
||||
// numberOfRows == 1,
|
||||
// numberOfCols == 1)
|
||||
// then
|
||||
//
|
||||
// entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
||||
// entity.setRedactionReason("Study title found");
|
||||
// entity.setLegalBasis("n-a");
|
||||
// entity.setRedaction(true);
|
||||
// entity.addMatchedRule("7");
|
||||
// });
|
||||
// end
|
||||
|
||||
|
||||
rule "7: study title old"
|
||||
when
|
||||
$section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL")))
|
||||
then
|
||||
// TODO
|
||||
// section.redactByRegExWithNewlines("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", true, 0, "title", 7, "Study title found", "n-a");
|
||||
|
||||
entityCreationService.byRegexWithLinebreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section)
|
||||
.forEach(entity -> {
|
||||
entity.setRedactionReason("Title found");
|
||||
entity.setLegalBasis("n-a");
|
||||
entity.setRedaction(true);
|
||||
entity.addMatchedRule("7");
|
||||
});
|
||||
|
||||
entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
entity.setRedactionReason("Title found");
|
||||
@ -447,7 +453,6 @@ rule "8c: Performing Laboratory (Country & Name) from dict"
|
||||
|
||||
|
||||
|
||||
// Headline not found because of OCR.
|
||||
rule "9: GLP Study"
|
||||
when
|
||||
$headline: Headline(containsString("GOOD LABORATORY PRACTICE COMPLIANCE")
|
||||
@ -694,8 +699,8 @@ rule "14: Dosages"
|
||||
FileAttribute(label == "OECD Number", value == "425")
|
||||
$section: Section(
|
||||
(
|
||||
getHeadline().containsString("Dosages")
|
||||
|| getHeadline().containsString("Study Design")
|
||||
anyHeadlineContainsString("Dosages")
|
||||
|| anyHeadlineContainsString("Study Design")
|
||||
)
|
||||
&& !getHeadline().containsString("TABLE")
|
||||
)
|
||||
@ -714,7 +719,13 @@ rule "14: Dosages"
|
||||
entity.addMatchedRule("14");
|
||||
});
|
||||
|
||||
//TODO section.redactByRegExWithNewlines("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])",true, 1, "dosages", 14, "Dosage found", "n-a");
|
||||
entityCreationService.byRegexWithLinebreaks("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])", "dosages", EntityType.ENTITY,1, $section)
|
||||
.forEach(entity -> {
|
||||
entity.setRedactionReason("Dosage found");
|
||||
entity.setLegalBasis("n-a");
|
||||
entity.setRedaction(true);
|
||||
entity.addMatchedRule("14");
|
||||
});
|
||||
end
|
||||
|
||||
rule "15: Mortality"
|
||||
@ -751,7 +762,7 @@ rule "18: Weight Behavior Changes"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "402")
|
||||
$section: Section(
|
||||
getHeadline().containsString("Results")
|
||||
getHeadline().containsStringIgnoreCase("Results")
|
||||
&& (
|
||||
containsString("body weight")
|
||||
|| containsString("body weights")
|
||||
@ -776,8 +787,8 @@ rule "19: Necropsy findings"
|
||||
|| getHeadline().containsString("Macroscopic Findings")
|
||||
|| getHeadline().containsString("Macroscopic examination")
|
||||
)
|
||||
&& !getHeadline().containsString("Table")
|
||||
&& !getHeadline().containsString("Appendix")
|
||||
&& !getHeadline().containsStringIgnoreCase("Table")
|
||||
&& !getHeadline().containsStringIgnoreCase("Appendix")
|
||||
)
|
||||
then
|
||||
var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "necropsy_findings", EntityType.ENTITY, $section);
|
||||
@ -793,13 +804,13 @@ rule "22: Clinical observations"
|
||||
FileAttribute(label == "OECD Number", value == "403")
|
||||
$section: Section(
|
||||
(
|
||||
getHeadline().containsString("Clinical Observations")
|
||||
|| getHeadline().containsString("Clinical observations")
|
||||
|| getHeadline().containsString("In-life Observations")
|
||||
|| getHeadline().containsString("Postmortem Observations")
|
||||
anyHeadlineContainsStringIgnoreCase("Clinical Observations")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Clinical observations")
|
||||
|| anyHeadlineContainsStringIgnoreCase("In-life Observations")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Postmortem Observations")
|
||||
)
|
||||
&& !getHeadline().containsString("Appendix")
|
||||
&& !getHeadline().containsString("Table")
|
||||
&& !anyHeadlineContainsStringIgnoreCase("Appendix")
|
||||
&& !anyHeadlineContainsStringIgnoreCase("Table")
|
||||
)
|
||||
then
|
||||
|
||||
@ -872,8 +883,8 @@ rule "23: Bodyweight changes"
|
||||
|| getHeadline().containsString("Body Weights")
|
||||
|| getHeadline().containsString("Body Weight")
|
||||
)
|
||||
&& !getHeadline().containsString("Appendix")
|
||||
&& !getHeadline().containsString("TABLE")
|
||||
&& !getHeadline().containsStringIgnoreCase("Appendix")
|
||||
&& !getHeadline().containsStringIgnoreCase("TABLE")
|
||||
&& hasParagraphs()
|
||||
)
|
||||
then
|
||||
@ -889,7 +900,7 @@ rule "24: Study Design"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
|
||||
$section: Section(
|
||||
getHeadline().containsString("study design")
|
||||
anyHeadlineContainsStringIgnoreCase("study design")
|
||||
)
|
||||
then
|
||||
var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "study_design", EntityType.ENTITY, $section);
|
||||
@ -924,7 +935,7 @@ rule "26: Detailing (404 & 405)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405"))
|
||||
$section: Section(
|
||||
getHeadline().containsString("Results") && !getHeadline().containsString("Evaluation") && !getHeadline().containsString("study")
|
||||
getHeadline().containsStringIgnoreCase("Results") && !getHeadline().containsStringIgnoreCase("Evaluation") && !getHeadline().containsStringIgnoreCase("study") && hasParagraphs()
|
||||
)
|
||||
then
|
||||
var entity = entityCreationService.byBoundary(Boundary.merge($section.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBoundary).toList()), "detailing", EntityType.ENTITY, $section);
|
||||
@ -991,10 +1002,10 @@ rule "35: Sex"
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("405","429"))
|
||||
$section: Section(
|
||||
(
|
||||
getHeadline().containsString("animal")
|
||||
|| getHeadline().containsString("test system")
|
||||
getHeadline().containsStringIgnoreCase("animal")
|
||||
|| getHeadline().containsStringIgnoreCase("test system")
|
||||
)
|
||||
&& !getHeadline().containsString("selection")
|
||||
&& !getHeadline().containsStringIgnoreCase("selection")
|
||||
&& (
|
||||
containsStringIgnoreCase("sex:")
|
||||
|| containsStringIgnoreCase("male")
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user