DM-307: Implement across column function

This commit is contained in:
Dominique Eifländer 2023-07-14 10:57:14 +02:00
parent 45fe200521
commit 6565fa1446
11 changed files with 394 additions and 71 deletions

View File

@ -122,7 +122,7 @@ public class TableNodeFactory {
textBlock = context.getTextBlockFactory().fromContext(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, List.of(tb)));
}
}

View File

@ -25,6 +25,7 @@ import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Data
@ -44,6 +45,7 @@ public class AtomicTextBlock implements TextBlock {
//position coordinates
List<Integer> stringIdxToPositionIdx;
@Getter
List<Rectangle2D> positions;
@EqualsAndHashCode.Exclude
@ -119,17 +121,20 @@ public class AtomicTextBlock implements TextBlock {
}
public CharSequence getLine(int lineNumber) {
public Boundary getLineBoundary(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
return new Boundary(boundary.start(), boundary.start());
}
if (numberOfLines() == 1) {
return boundary;
}
if (lineNumber == 0) {
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
return new Boundary(boundary.start(), lineBreaks.get(0) + boundary.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
return new Boundary(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
}
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
return new Boundary(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
}
@ -209,10 +214,7 @@ public class AtomicTextBlock implements TextBlock {
return "";
}
Set<Integer> lbInBoundary = lineBreaks.stream()
.map(i -> i+ boundary.start())
.filter(boundary::contains)
.collect(Collectors.toSet());
Set<Integer> lbInBoundary = lineBreaks.stream().map(i -> i + boundary.start()).filter(boundary::contains).collect(Collectors.toSet());
if (boundary.end() == getBoundary().end()) {
lbInBoundary.add(getBoundary().end());
}

View File

@ -89,7 +89,7 @@ public class ConcatenatedTextBlock implements TextBlock {
@Override
public int numberOfLines() {
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
return atomicTextBlocks.stream().mapToInt(AtomicTextBlock::numberOfLines).sum();
}
@ -114,18 +114,37 @@ public class ConcatenatedTextBlock implements TextBlock {
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
}
/**
 * Resolves a line number, counted across all contained atomic text blocks in order, to the boundary of that line.
 * <p>
 * The line number is consumed block by block: each atomic text block that has fewer lines than the remaining
 * line number is skipped and its line count is subtracted, until a block is found that contains the line.
 *
 * @param lineNumber zero-based line number relative to this concatenated text block
 * @return the boundary reported by the atomic text block containing the line; if the line number exceeds the
 * total number of lines, an empty boundary that starts and ends at the start of this text block
 */
@Override
public Boundary getLineBoundary(int lineNumber) {

    if (atomicTextBlocks.size() == 1) {
        return atomicTextBlocks.get(0).getLineBoundary(lineNumber);
    }

    int lineNumberInCurrentBlock = lineNumber;
    for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) {
        int linesInBlock = atomicTextBlock.numberOfLines();
        if (lineNumberInCurrentBlock < linesInBlock) {
            return atomicTextBlock.getLineBoundary(lineNumberInCurrentBlock);
        }
        // the requested line lies behind this block, continue with a line number relative to the next block
        lineNumberInCurrentBlock -= linesInBlock;
    }

    // no atomic text block contains the requested line: report an empty boundary instead of throwing
    return new Boundary(boundary.start(), boundary.start());
}
@Override
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
if (textBlocks.isEmpty()) {
return Collections.emptyList();
}
if (textBlocks.size() == 1) {
return textBlocks.get(0).getPositions(stringBoundary);
}

View File

@ -12,6 +12,7 @@ import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
public interface TextBlock extends CharSequence {
@ -30,6 +31,10 @@ public interface TextBlock extends CharSequence {
int getPreviousLinebreak(int fromIndex);
Boundary getLineBoundary(int lineNumber);
List<Integer> getLineBreaks();
@ -48,6 +53,23 @@ public interface TextBlock extends CharSequence {
int numberOfLines();
/**
 * Returns the part of this text block that is covered by the boundary of the given line.
 *
 * @param lineNumber the line number passed on to {@link #getLineBoundary(int)}
 * @return the sub-sequence for the boundary of that line
 */
default CharSequence getLine(int lineNumber) {

    Boundary lineBoundary = getLineBoundary(lineNumber);
    return subSequence(lineBoundary);
}
/**
 * Returns the positions of this text block that belong to the boundary of the given line.
 *
 * @param lineNumber the line number passed on to {@link #getLineBoundary(int)}
 * @return the positions reported for the boundary of that line
 */
default List<Rectangle2D> getLinePositions(int lineNumber) {

    Boundary lineBoundary = getLineBoundary(lineNumber);
    return getPositions(lineBoundary);
}
/**
 * Computes the bounding box of the given line from the positions of that line.
 *
 * @param lineNumber the line number passed on to {@link #getLinePositions(int)}
 * @return the result of {@code RectangleTransformations.rectangle2DBBox} for the positions of that line
 */
default Rectangle2D getLineBBox(int lineNumber) {

    List<Rectangle2D> linePositions = getLinePositions(lineNumber);
    return RectangleTransformations.rectangle2DBBox(linePositions);
}
default String searchTextWithLineBreaks() {
return subSequenceWithLineBreaks(getBoundary());

View File

@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.layoutparsing.document.service
import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RedactionSearchUtility.getExpandedEndByRegex;
import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RedactionSearchUtility.getExpandedStartByRegex;
import static com.iqser.red.service.redaction.v1.server.redaction.utils.SeparatorUtils.boundaryIsSurroundedBySeparators;
import static java.util.stream.Collectors.toMap;
import java.util.Collection;
import java.util.Collections;
@ -15,8 +16,10 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.tuple.Pair;
import org.kie.api.runtime.KieSession;
import com.google.common.base.Functions;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
@ -26,8 +29,10 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.en
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RedactionSearchUtility;
import com.iqser.red.service.redaction.v1.server.redaction.adapter.NerEntities;
import com.iqser.red.service.redaction.v1.server.redaction.adapter.NerEntitiesAdapter;
@ -61,7 +66,72 @@ public class EntityCreationService {
}
public Stream<RedactionEntity> betweenStringsInclusive(String start, String stop, String type, EntityType entityType, SemanticNode node) {
/**
 * Searches the text block of the node for {@code start} and {@code stop} using the ignore-case search
 * and hands both result lists to {@code betweenBoundaries}.
 *
 * @param start      the string whose occurrences are used as start boundaries
 * @param stop       the string whose occurrences are used as stop boundaries
 * @param type       the type passed on to {@code betweenBoundaries}
 * @param entityType the entity type passed on to {@code betweenBoundaries}
 * @param node       the node whose text block is searched
 * @return the stream returned by {@code betweenBoundaries}
 */
public Stream<RedactionEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {

    return betweenBoundaries(RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock()),
            RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock()),
            type,
            entityType,
            node);
}
/**
 * Like a plain between-strings search, but every start boundary is moved left by the length of {@code start}
 * before the boundaries are handed to {@code betweenBoundaries}.
 * NOTE(review): presumably this makes the start string itself part of the resulting entity - confirm against
 * {@code betweenBoundaries}.
 *
 * @param start      the string whose occurrences are used as (shifted) start boundaries
 * @param stop       the string whose occurrences are used as stop boundaries
 * @param type       the type passed on to {@code betweenBoundaries}
 * @param entityType the entity type passed on to {@code betweenBoundaries}
 * @param node       the node whose text block is searched
 * @return the stream returned by {@code betweenBoundaries}
 */
public Stream<RedactionEntity> betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node) {

    List<Boundary> startMatches = RedactionSearchUtility.findBoundariesByString(start, node.getTextBlock());
    List<Boundary> stopMatches = RedactionSearchUtility.findBoundariesByString(stop, node.getTextBlock());

    // shift each start match to the left by the length of the search string (start first, then end)
    for (Boundary startMatch : startMatches) {
        startMatch.setStart(startMatch.start() - start.length());
        startMatch.setEnd(startMatch.end() - start.length());
    }

    return betweenBoundaries(startMatches, stopMatches, type, entityType, node);
}
/**
 * Ignore-case variant of the include-start search: every start boundary is moved left by the length of
 * {@code start} before the boundaries are handed to {@code betweenBoundaries}.
 * NOTE(review): presumably this makes the start string itself part of the resulting entity - confirm against
 * {@code betweenBoundaries}.
 *
 * @param start      the string whose occurrences (ignoring case) are used as (shifted) start boundaries
 * @param stop       the string whose occurrences (ignoring case) are used as stop boundaries
 * @param type       the type passed on to {@code betweenBoundaries}
 * @param entityType the entity type passed on to {@code betweenBoundaries}
 * @param node       the node whose text block is searched
 * @return the stream returned by {@code betweenBoundaries}
 */
public Stream<RedactionEntity> betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {

    List<Boundary> startMatches = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
    List<Boundary> stopMatches = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());

    // shift each start match to the left by the length of the search string (start first, then end)
    for (Boundary startMatch : startMatches) {
        startMatch.setStart(startMatch.start() - start.length());
        startMatch.setEnd(startMatch.end() - start.length());
    }

    return betweenBoundaries(startMatches, stopMatches, type, entityType, node);
}
/**
 * Like a plain between-strings search, but every stop boundary is moved right by the length of {@code stop}
 * before the boundaries are handed to {@code betweenBoundaries}.
 * NOTE(review): presumably this makes the stop string itself part of the resulting entity - confirm against
 * {@code betweenBoundaries}.
 *
 * @param start      the string whose occurrences are used as start boundaries
 * @param stop       the string whose occurrences are used as (shifted) stop boundaries
 * @param type       the type passed on to {@code betweenBoundaries}
 * @param entityType the entity type passed on to {@code betweenBoundaries}
 * @param node       the node whose text block is searched
 * @return the stream returned by {@code betweenBoundaries}
 */
public Stream<RedactionEntity> betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) {

    List<Boundary> startMatches = RedactionSearchUtility.findBoundariesByString(start, node.getTextBlock());
    List<Boundary> stopMatches = RedactionSearchUtility.findBoundariesByString(stop, node.getTextBlock());

    // shift each stop match to the right by the length of the search string (start first, then end)
    for (Boundary stopMatch : stopMatches) {
        stopMatch.setStart(stopMatch.start() + stop.length());
        stopMatch.setEnd(stopMatch.end() + stop.length());
    }

    return betweenBoundaries(startMatches, stopMatches, type, entityType, node);
}
/**
 * Ignore-case variant of the include-end search: every stop boundary is moved right by the length of
 * {@code stop} before the boundaries are handed to {@code betweenBoundaries}.
 * NOTE(review): presumably this makes the stop string itself part of the resulting entity - confirm against
 * {@code betweenBoundaries}.
 *
 * @param start      the string whose occurrences (ignoring case) are used as start boundaries
 * @param stop       the string whose occurrences (ignoring case) are used as (shifted) stop boundaries
 * @param type       the type passed on to {@code betweenBoundaries}
 * @param entityType the entity type passed on to {@code betweenBoundaries}
 * @param node       the node whose text block is searched
 * @return the stream returned by {@code betweenBoundaries}
 */
public Stream<RedactionEntity> betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {

    List<Boundary> startMatches = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
    List<Boundary> stopMatches = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());

    // shift each stop match to the right by the length of the search string (start first, then end)
    for (Boundary stopMatch : stopMatches) {
        stopMatch.setStart(stopMatch.start() + stop.length());
        stopMatch.setEnd(stopMatch.end() + stop.length());
    }

    return betweenBoundaries(startMatches, stopMatches, type, entityType, node);
}
public Stream<RedactionEntity> betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) {
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByString(start, node.getTextBlock());
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByString(stop, node.getTextBlock());
@ -79,7 +149,10 @@ public class EntityCreationService {
}
public Stream<RedactionEntity> betweenStringsInclusiveIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
public Stream<RedactionEntity> betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
@ -97,13 +170,7 @@ public class EntityCreationService {
}
public Stream<RedactionEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
}
public Stream<RedactionEntity> betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) {
@ -209,6 +276,55 @@ public class EntityCreationService {
}
/**
 * Searches every cell of the table for {@code string} and, for each occurrence, creates entities via
 * {@code lineAfterBoundariesAcrossColumns}.
 *
 * @param string     the string to search for in each table cell
 * @param type       the type of the entities to create
 * @param entityType the entity type of the entities to create
 * @param tableNode  the table whose cells are searched
 * @return a stream of all entities created for all cells
 */
public Stream<RedactionEntity> lineAfterStringAcrossColumns(String string, String type, EntityType entityType, Table tableNode) {

    return tableNode.streamTableCells().flatMap(cell -> {
        List<Boundary> matches = RedactionSearchUtility.findBoundariesByString(string, cell.getTextBlock());
        return lineAfterBoundariesAcrossColumns(matches, cell, type, entityType, tableNode);
    });
}
/**
 * Ignore-case variant: searches every cell of the table for {@code string} and, for each occurrence, creates
 * entities via {@code lineAfterBoundariesAcrossColumns}.
 *
 * @param string     the string to search for (ignoring case) in each table cell
 * @param type       the type of the entities to create
 * @param entityType the entity type of the entities to create
 * @param tableNode  the table whose cells are searched
 * @return a stream of all entities created for all cells
 */
public Stream<RedactionEntity> lineAfterStringAcrossColumnsIgnoreCase(String string, String type, EntityType entityType, Table tableNode) {

    return tableNode.streamTableCells().flatMap(cell -> {
        List<Boundary> matches = RedactionSearchUtility.findBoundariesByStringIgnoreCase(string, cell.getTextBlock());
        return lineAfterBoundariesAcrossColumns(matches, cell, type, entityType, tableNode);
    });
}
/**
 * For every given boundary, looks across the remaining cells of the same table row (those with a higher column
 * index than the provided TableCell) and collects the lines whose bounding box overlaps the y range of the
 * bounding box of the boundary.
 * <p>
 * Per boundary and per cell to the right, the matching lines are combined into one boundary, trimmed against the
 * text block of the table, checked with {@code isValidEntityBoundary} and turned into an entity via
 * {@code byBoundary}. Cells that yield no valid boundary or no entity contribute nothing to the result.
 *
 * @param boundaries a list of boundaries inside the text block of {@code tableCell}
 * @param tableCell  the table cell the boundaries were found in
 * @param type       the type of the entities to create
 * @param entityType the entity type of the entities to create
 * @param tableNode  the table containing {@code tableCell}; entities are created on this node
 * @return a stream of RedactionEntities, one per boundary and matching cell to the right
 */
private Stream<RedactionEntity> lineAfterBoundariesAcrossColumns(List<Boundary> boundaries, TableCell tableCell, String type, EntityType entityType, Table tableNode) {

    return boundaries.stream()
            .map(boundary -> RectangleTransformations.rectangle2DBBox(tableCell.getTextBlock().getPositions(boundary)))
            .flatMap(bBox -> tableNode.streamRow(tableCell.getRow())
                    // only cells to the right of the cell containing the found text
                    .filter(nextTableCell -> nextTableCell.getCol() > tableCell.getCol())
                    .map(nextTableCell -> RedactionSearchUtility.findBoundaryOfAllLinesInYRange(bBox.getMaxY(), bBox.getMinY(), nextTableCell.getTextBlock()))
                    .map(lineBoundary -> lineBoundary.trim(tableNode.getTextBlock()))
                    .filter(lineBoundary -> isValidEntityBoundary(tableNode.getTextBlock(), lineBoundary))
                    .map(lineBoundary -> byBoundary(lineBoundary, type, entityType, tableNode))
                    // drop cells for which no entity could be created
                    .flatMap(Optional::stream));
}
public Optional<RedactionEntity> semanticNodeAfterString(SemanticNode semanticNode, String string, String type, EntityType entityType) {
var textBlock = semanticNode.getTextBlock();

View File

@ -2,11 +2,13 @@ package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
@ -86,6 +88,22 @@ public class RedactionSearchUtility {
return expandedStart;
}
/**
 * Collects the boundaries of all lines of the text block that pass {@code isWithinYRange} for the given
 * y range and combines them with {@code Boundary.merge}.
 *
 * @param maxY      the upper limit of the y range
 * @param minY      the lower limit of the y range
 * @param textBlock the text block whose lines are inspected
 * @return the merged boundary of all matching lines, or an empty boundary starting and ending at the start of
 * the text block if no line matches
 */
public static Boundary findBoundaryOfAllLinesInYRange(double maxY, double minY, TextBlock textBlock) {

    // mapToObj avoids boxing every line index into an Integer first
    List<Boundary> lineBoundaries = IntStream.range(0, textBlock.numberOfLines())
            .mapToObj(textBlock::getLineBoundary)
            .filter(lineBoundary -> isWithinYRange(maxY, minY, textBlock, lineBoundary))
            .toList();

    if (lineBoundaries.isEmpty()) {
        return new Boundary(textBlock.getBoundary().start(), textBlock.getBoundary().start());
    }
    return Boundary.merge(lineBoundaries);
}
/**
 * Checks whether the bounding box of the given line boundary overlaps the open y interval
 * ({@code minY}, {@code maxY}).
 *
 * @param maxY         the upper limit of the y range
 * @param minY         the lower limit of the y range
 * @param textBlock    the text block providing the positions of the line
 * @param lineBoundary the boundary of the line to check
 * @return true if the bounding box starts below {@code maxY} and ends above {@code minY}
 */
private static boolean isWithinYRange(double maxY, double minY, TextBlock textBlock, Boundary lineBoundary) {

    Rectangle2D bounds = RectangleTransformations.rectangle2DBBox(textBlock.getPositions(lineBoundary));
    boolean startsBelowUpperLimit = bounds.getMinY() < maxY;
    if (!startsBelowUpperLimit) {
        return false;
    }
    return minY < bounds.getMaxY();
}
public static List<Boundary> findBoundariesByRegex(String regexPattern, TextBlock textBlock) {
@ -158,8 +176,8 @@ public class RedactionSearchUtility {
public static List<Boundary> findBoundariesByStringIgnoreCase(String searchString, TextBlock textBlock) {
String searchStringLowerCase = searchString.toLowerCase(Locale.ROOT);
return findBoundariesByString(searchStringLowerCase, textBlock);
Pattern pattern = Pattern.compile(Pattern.quote(searchString), Pattern.CASE_INSENSITIVE);
return getBoundariesByPattern(textBlock, 0, pattern);
}
}

View File

@ -46,8 +46,8 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
public void titleExtraction() throws IOException {
AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/403-17_Fantom_ToxicidadeInalatoriaAguda.pdf",
"files/Documine/Flora/ProblemDocs/d75cd9358f7949552697764428183472.TABLES.json");
AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
"files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));

View File

@ -315,36 +315,36 @@ rule "DOC.5.0: Strain"
end
//rule "DOC.7.0: study title by document structure"
// when
// $table: Table(isOnPage(1),
// (containsString("Final Report") || containsString("SPL")),
// numberOfRows == 1,
// numberOfCols == 1)
// then
//
// entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
// entity.apply("DOC.7.0", "Study title found", "n-a");
// });
// end
rule "DOC.7.0: study title"
rule "DOC.7.0: study title by document structure"
when
$section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL")))
$table: Table(isOnPage(1),
(containsString("Final Report") || containsString("SPL")),
numberOfRows == 1,
numberOfCols == 1)
then
entityCreationService.byRegexWithLineBreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
entity.apply("DOC.7.0", "Title found", "n-a");
});
entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
entity.apply("DOC.7.0", "Title found", "n-a");
});
entityCreationService.betweenStrings("Laboratories", "SPL", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
entity.apply("DOC.7.0", "Title found", "n-a");
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
entity.apply("DOC.7.0", "Study title found", "n-a");
});
end
//rule "DOC.7.0: study title"
// when
// $section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL")))
// then
// entityCreationService.byRegexWithLineBreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
// entity.apply("DOC.7.0", "Title found", "n-a");
// });
// entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
// entity.apply("DOC.7.0", "Title found", "n-a");
// });
// entityCreationService.betweenStrings("Laboratories", "SPL", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
// entity.apply("DOC.7.0", "Title found", "n-a");
// });
// end
rule "DOC.8.1: Performing Laboratory (Name)"
when
$section: Section(containsString("PERFORMING LABORATORY:"))
@ -422,8 +422,8 @@ rule "DOC.10.0: Batch number from CoA"
$section: Section(
(
anyHeadlineContainsString("Analytical Report")
|| anyHeadlineContainsString("Certificate of Analysis")
|| containsStringIgnoreCase("certificate of analysis")
|| anyHeadlineContainsStringIgnoreCase("Certificate of Analysis")
|| containsStringIgnoreCase("Certificate of Analysis")
)
&& (
containsStringIgnoreCase("batch")
@ -460,16 +460,16 @@ rule "DOC.10.1: Batch number"
when
$section: Section(
(
anyHeadlineContainsString("Test Substance")
|| getHeadline().containsString("Test and Control Substances")
|| getHeadline().containsString("Test Substances")
|| getHeadline().containsString("Test Substance")
|| getHeadline().containsString("Test Item")
anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test Item")
)
&& !(
getHeadline().containsString("component")
|| getHeadline().containsString("reference")
|| getHeadline().containsString("blank")
anyHeadlineContainsString("component")
|| anyHeadlineContainsString("reference")
|| anyHeadlineContainsString("blank")
)
&& containsStringIgnoreCase("batch")
)
@ -491,6 +491,35 @@ rule "DOC.10.1: Batch number"
end
// DOC.10.2: for every table below a section that has a matching test substance / test item headline
// (and no "component", "reference" or "blank" headline) and contains "batch" (ignoring case), the lines in the
// cells to the right of a batch number label are applied as batch_number entities.
rule "DOC.10.2: Batch number"
    when
        $section: Section(
            (
                anyHeadlineContainsStringIgnoreCase("Test Substance")
                || anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
                || anyHeadlineContainsStringIgnoreCase("Test Substances")
                || anyHeadlineContainsStringIgnoreCase("Test Item")
            )
            && !(
                anyHeadlineContainsString("component")
                || anyHeadlineContainsString("reference")
                || anyHeadlineContainsString("blank")
            )
            && containsStringIgnoreCase("batch")
        )
        $table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
    then
        entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
            entity.apply("DOC.10.2", "Batch number found", "n-a");
        });
        entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
            entity.apply("DOC.10.2", "Batch number found", "n-a");
        });
end
rule "DOC.11.0: Conclusions - LD50, LC50, Confidence"
when
@ -529,7 +558,7 @@ rule "DOC.12.0: Guideline Deviation"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436","471"))
$section: Section(
(getHeadline().containsString("General Information") || containsString("GENERAL INFORMATION"))
(getHeadline().containsStringIgnoreCase("General Information") || containsString("GENERAL INFORMATION"))
&& (containsStringIgnoreCase("from the") || containsStringIgnoreCase("to the"))
)
then
@ -542,7 +571,7 @@ rule "DOC.12.0: Guideline Deviation"
entityCreationService.betweenStrings("Deviations from the study plan", "Regulatory Guidelines", "guideline_deviation", EntityType.ENTITY, $section).forEach(entity -> {
entity.apply("DOC.12.0", "Deviation from the study plan found", "n-a");
});
entityCreationService.byRegex("(?>Study plan adherence)(.{1,20}deviations.{1,20} to the study plan.{0,50}\\.)\\s", "guideline_deviation", EntityType.ENTITY, 1, $section).forEach(entity -> {
entityCreationService.byRegexIgnoreCase("(?>Study plan adherence)(.{1,20}deviations.{1,20} to the study plan.{0,50}\\.)\\s", "guideline_deviation", EntityType.ENTITY, 1, $section).forEach(entity -> {
entity.apply("DOC.12.0", "Guideline deviation found in text.", "n-a");
});
entityCreationService.betweenStrings("Deviations from the study plan", "validity of the study.", "guideline_deviation", EntityType.ENTITY, $section).forEach(entity -> {
@ -579,14 +608,14 @@ rule "DOC.14.0: Dosages"
when
FileAttribute(label == "OECD Number", value == "425")
$section: Section(
(anyHeadlineContainsString("Dosages") || anyHeadlineContainsString("Study Design"))
(anyHeadlineContainsStringIgnoreCase("Dosages") || anyHeadlineContainsStringIgnoreCase("Study Design"))
&& !getHeadline().containsString("TABLE")
)
then
entityCreationService.betweenStringsInclusive("The animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
entityCreationService.betweenStringsIncludeStartAndEnd("The animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
entity.apply("DOC.14.0", "Dosage found", "n-a");
});
entityCreationService.betweenStringsInclusive("Animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
entityCreationService.betweenStringsIncludeStartAndEnd("Animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
entity.apply("DOC.14.0", "Dosage found", "n-a");
});
entityCreationService.byRegexWithLineBreaks("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])", "dosages", EntityType.ENTITY,1, $section).forEach(entity -> {
@ -615,7 +644,7 @@ rule "DOC.17.0: Study Conclusion"
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.17.0", "Study Conclusion found", "n-a"));
end
/*
rule "DOC.18.0: Weight Behavior Changes"
when
@ -633,8 +662,84 @@ rule "DOC.18.0: Weight Behavior Changes"
entityCreationService.bySemanticNodeParagraphsOnly($section, "weight_behavior_changes", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.18.0", "Weight behavior changes found", "n-a"));
end
*/
// DOC.18.0: only for files whose "OECD Number" file attribute equals "402".
// Matches sections whose headline contains "Results" and whose text contains one of the listed body weight
// spellings (case-sensitive checks), and applies the entities returned by bySemanticNodeParagraphsOnly
// as weight_behavior_changes.
rule "DOC.18.0: Weight Behavior Changes"
when
FileAttribute(label == "OECD Number", value == "402")
$section: Section(
getHeadline().containsString("Results")
&& (
containsString("body weight")
|| containsString("body weights")
|| containsString("bodyweight")
|| containsString("bodyweights")
)
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "weight_behavior_changes", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.18.0", "Weight behavior changes found", "n-a"));
end
// DOC.19.0: only for files whose "OECD Number" file attribute is 402, 403 or 436.
// Matches sections with a Necropsy / Macroscopic Findings / Macroscopic examination headline, unless the
// section headline contains "Table", "Appendix" or "3 - MACROSCOPIC FINDINGS" (all checks ignore case), and
// applies the entities returned by bySemanticNodeParagraphsOnly as necropsy_findings.
rule "DOC.19.0: Necropsy findings"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","436"))
$section: Section(
(
anyHeadlineContainsStringIgnoreCase("Necropsy")
|| getHeadline().containsStringIgnoreCase("Macroscopic Findings")
|| getHeadline().containsStringIgnoreCase("Macroscopic examination")
)
&& !getHeadline().containsStringIgnoreCase("Table")
&& !getHeadline().containsStringIgnoreCase("Appendix")
&& !getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS")
// alternative exclusions that were tried and are currently disabled:
//&& !containsString("3 - MACROSCOPIC FINDINGS")
//&& !anyHeadlineContainsString("3 - MACROSCOPIC FINDINGS")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "necropsy_findings", EntityType.ENTITY)
.forEach( entity -> entity.apply("DOC.19.0", "Necropsy section found", "n-a"));
end
/*
rule "DOC.19.0: Necropsy findings"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","436"))
$section: Section(
(
anyHeadlineContainsStringIgnoreCase("Necropsy")
|| getHeadline().containsStringIgnoreCase("Macroscopic Findings")
|| getHeadline().containsStringIgnoreCase("Macroscopic examination")
)
&& !getHeadline().containsStringIgnoreCase("Table")
&& !getHeadline().containsStringIgnoreCase("Appendix")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "necropsy_findings", EntityType.ENTITY)
.forEach( entity -> entity.apply("DOC.19.0", "Necropsy section found", "n-a"));
end
*/
// DOC.22.0: only for files whose "OECD Number" file attribute equals "403".
// Matches sections with a clinical / in-life / postmortem observations headline and without an
// "Appendix", "Table" or "Mortality" headline, and applies the entities returned by
// bySemanticNodeParagraphsOnly as clinical_observations.
// NOTE(review): "Clinical Observations" and "Clinical observations" differ only in case; with an ignore-case
// check one of the two is presumably redundant - confirm before removing.
rule "DOC.22.0: Clinical observations"
when
FileAttribute(label == "OECD Number", value == "403")
$section: Section(
(
anyHeadlineContainsStringIgnoreCase("Clinical Observations")
|| anyHeadlineContainsStringIgnoreCase("Clinical observations")
|| anyHeadlineContainsStringIgnoreCase("In-life Observations")
|| anyHeadlineContainsStringIgnoreCase("Postmortem Observations")
)
&& !anyHeadlineContainsStringIgnoreCase("Appendix")
&& !anyHeadlineContainsStringIgnoreCase("Table")
&& !anyHeadlineContainsStringIgnoreCase("Mortality")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_observations", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.22.0", "Clinical observations section found", "n-a"));
end
/*
rule "DOC.19.0: Necropsy findings"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","436"))
@ -670,7 +775,7 @@ rule "DOC.22.0: Clinical observations"
entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_observations", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.22.0", "Clinical observations section found", "n-a"));
end
*/
/* These two were also commented out before
rule "DOC.23.1: Bodyweight changes"
@ -723,19 +828,33 @@ rule "DOC.24.0: Study Design"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
$section: Section(
//retry this with only getHeadline().containsStringIgnoreCase("study design")
anyHeadlineContainsStringIgnoreCase("study design")
&& !anyHeadlineContainsString("Preliminary screening test")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_design", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.24.0", "Study design section found", "n-a"));
end
/*
rule "DOC.24.0: Study Design"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
$section: Section(
anyHeadlineContainsStringIgnoreCase("study design")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_design", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.24.0", "Study design section found", "n-a"));
end
*/
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
$section: Section(
(getHeadline().containsString("Results") || getHeadline().containsString("Conclusion"))
(getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion"))
&& !getHeadline().containsString("POSITIVE CONTROL")
&& !getHeadline().containsString("Positive Control")
&& !getHeadline().containsString("Evaluation")
@ -752,7 +871,22 @@ rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
end
// TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first.
// DOC.26.0: only for files whose "OECD Number" file attribute is 404 or 405.
// Matches sections that have a headline containing "Results" (ignoring case) unless the section headline
// contains "Evaluation" or "study" (ignoring case), and applies the entities returned by
// bySemanticNodeParagraphsOnly as detailing.
rule "DOC.26.0: Detailing (404 & 405)"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405"))
$section: Section(
anyHeadlineContainsStringIgnoreCase("Results")
&& !getHeadline().containsStringIgnoreCase("Evaluation")
&& !getHeadline().containsStringIgnoreCase("study")
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "detailing", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.26.0", "Detailing found", "n-a"));
end
/*
rule "DOC.26.0: Detailing (404 & 405)"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405"))
@ -766,13 +900,13 @@ rule "DOC.26.0: Detailing (404 & 405)"
entityCreationService.bySemanticNodeParagraphsOnly($section, "detailing", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.26.0", "Detailing found", "n-a"));
end
*/
rule "DOC.32.0: Preliminary Test Results (429)"
when
FileAttribute(label == "OECD Number", value == "429")
$section: Section(
((getHeadline().containsString("Preliminary Screening Test") && containsString("Clinical observations"))
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|| getHeadline().containsString("Pre-Experiment"))
)
then
@ -942,18 +1076,28 @@ rule "DOC.40.0: Positive Control"
rule "DOC.42.0: Mortality Statement"
when
FileAttribute(label == "OECD Number", value == "402")
$headline: Headline(containsString("Mortality") && !containsString("TABLE"))
$headline: Headline(containsStringIgnoreCase("Mortality") && !containsString("TABLE"))
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality_statement", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.42.0", "Mortality Statement found", "n-a"));
end
/*
rule "DOC.42.0: Mortality Statement"
when
FileAttribute(label == "OECD Number", value == "402")
$headline: Headline(containsString("Mortality") && !containsString("TABLE"))
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality_statement", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.42.0", "Mortality Statement found", "n-a"));
end
*/
rule "DOC.43.0: Dose Mortality"
when
FileAttribute(label == "OECD Number", value == "425")
$table: Table(
(hasHeader("Mortality") || hasHeader("Long Term Results") || hasHeader("Long Term Outcome") || hasHeader("Comments") || hasHeader("Viability / Mortality") || hasHeader("Viability/Mortality"))
(hasHeader("Mortality") || hasHeader("Long Term Results") || hasHeader("LongTerm Outcome") || hasHeader("Long Term Outcome") || hasHeader("Comments") || hasHeader("Viability / Mortality") || hasHeader("Viability/Mortality"))
&&
(hasHeader("Dose [mg/kg bodyweight]") || hasHeader("Dose [mg/kg body weight]") ||hasHeader("Dose (mg/kg)") || hasHeader("Dose levei (mg/kg)") || hasHeader("Dose Level (mg/kg)") || hasHeader("Dose level (mg/kg)") || hasHeader("Dosage [mg/kg body weight]"))
)
@ -962,6 +1106,7 @@ rule "DOC.43.0: Dose Mortality"
$table.streamTableCellsWithHeader("Comments"),
$table.streamTableCellsWithHeader("Long Term Results"),
$table.streamTableCellsWithHeader("Long Term Outcome"),
$table.streamTableCellsWithHeader("LongTerm Outcome"),
$table.streamTableCellsWithHeader("Viability / Mortality"),
$table.streamTableCellsWithHeader("Viability/Mortality")
).flatMap(a -> a)
@ -1110,6 +1255,7 @@ rule "X.0.0: remove Entity contained by Entity of same type"
retract($contained);
end
// Rule unit: X.7
rule "X.7.0: remove all images"
salience 512
@ -1120,6 +1266,7 @@ rule "X.7.0: remove all images"
retract($image);
end
//------------------------------------ File attributes rules ------------------------------------
// Rule unit: FA.1
@ -1134,8 +1281,6 @@ rule "FA.1.0: remove duplicate FileAttributes"
end
//------------------------------------ Local dictionary search rules ------------------------------------
// Rule unit: LDS.0
rule "LDS.0.0: run local dictionary search"
agenda-group "LOCAL_DICTIONARY_ADDS"