DM-307: Implement across column function
This commit is contained in:
parent
45fe200521
commit
6565fa1446
@ -122,7 +122,7 @@ public class TableNodeFactory {
|
||||
textBlock = context.getTextBlockFactory().fromContext(sequences, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, List.of(tb)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -25,6 +25,7 @@ import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@ -44,6 +45,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
//position coordinates
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
@Getter
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
@ -119,17 +121,20 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
public CharSequence getLine(int lineNumber) {
|
||||
public Boundary getLineBoundary(int lineNumber) {
|
||||
|
||||
if (lineNumber >= numberOfLines() || lineNumber < 0) {
|
||||
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||
return new Boundary(boundary.start(), boundary.start());
|
||||
}
|
||||
if (numberOfLines() == 1) {
|
||||
return boundary;
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
||||
return new Boundary(boundary.start(), lineBreaks.get(0) + boundary.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
||||
return new Boundary(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
||||
}
|
||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
||||
return new Boundary(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
||||
}
|
||||
|
||||
|
||||
@ -209,10 +214,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
return "";
|
||||
}
|
||||
|
||||
Set<Integer> lbInBoundary = lineBreaks.stream()
|
||||
.map(i -> i+ boundary.start())
|
||||
.filter(boundary::contains)
|
||||
.collect(Collectors.toSet());
|
||||
Set<Integer> lbInBoundary = lineBreaks.stream().map(i -> i + boundary.start()).filter(boundary::contains).collect(Collectors.toSet());
|
||||
if (boundary.end() == getBoundary().end()) {
|
||||
lbInBoundary.add(getBoundary().end());
|
||||
}
|
||||
|
||||
@ -89,7 +89,7 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
|
||||
return atomicTextBlocks.stream().mapToInt(AtomicTextBlock::numberOfLines).sum();
|
||||
}
|
||||
|
||||
|
||||
@ -114,18 +114,37 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
|
||||
}
|
||||
|
||||
public Boundary getLineBoundary(int lineNumber) {
|
||||
|
||||
if (atomicTextBlocks.size() == 1) {
|
||||
return atomicTextBlocks.get(0).getLineBoundary(lineNumber);
|
||||
}
|
||||
int lineNumberInCurrentBlock = lineNumber;
|
||||
for (AtomicTextBlock atomicTextBlock : atomicTextBlocks) {
|
||||
if (lineNumberInCurrentBlock < atomicTextBlock.numberOfLines()) {
|
||||
return atomicTextBlock.getLineBoundary(lineNumberInCurrentBlock);
|
||||
}
|
||||
lineNumberInCurrentBlock -= atomicTextBlock.numberOfLines();
|
||||
}
|
||||
return new Boundary(boundary.start(), boundary.start());
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
||||
|
||||
if (textBlocks.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositions(stringBoundary);
|
||||
}
|
||||
|
||||
@ -12,6 +12,7 @@ import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
|
||||
|
||||
public interface TextBlock extends CharSequence {
|
||||
|
||||
@ -30,6 +31,10 @@ public interface TextBlock extends CharSequence {
|
||||
int getPreviousLinebreak(int fromIndex);
|
||||
|
||||
|
||||
Boundary getLineBoundary(int lineNumber);
|
||||
|
||||
|
||||
|
||||
List<Integer> getLineBreaks();
|
||||
|
||||
|
||||
@ -48,6 +53,23 @@ public interface TextBlock extends CharSequence {
|
||||
int numberOfLines();
|
||||
|
||||
|
||||
default CharSequence getLine(int lineNumber) {
|
||||
|
||||
return subSequence(getLineBoundary(lineNumber));
|
||||
}
|
||||
|
||||
|
||||
default List<Rectangle2D> getLinePositions(int lineNumber) {
|
||||
|
||||
return getPositions(getLineBoundary(lineNumber));
|
||||
}
|
||||
|
||||
|
||||
default Rectangle2D getLineBBox(int lineNumber) {
|
||||
|
||||
return RectangleTransformations.rectangle2DBBox(getLinePositions(lineNumber));
|
||||
}
|
||||
|
||||
default String searchTextWithLineBreaks() {
|
||||
|
||||
return subSequenceWithLineBreaks(getBoundary());
|
||||
|
||||
@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.layoutparsing.document.service
|
||||
import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RedactionSearchUtility.getExpandedEndByRegex;
|
||||
import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RedactionSearchUtility.getExpandedStartByRegex;
|
||||
import static com.iqser.red.service.redaction.v1.server.redaction.utils.SeparatorUtils.boundaryIsSurroundedBySeparators;
|
||||
import static java.util.stream.Collectors.toMap;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -15,8 +16,10 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.kie.api.runtime.KieSession;
|
||||
|
||||
import com.google.common.base.Functions;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.DocumentTree;
|
||||
@ -26,8 +29,10 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.en
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.NodeType;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.TableCell;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RedactionSearchUtility;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.adapter.NerEntities;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.adapter.NerEntitiesAdapter;
|
||||
@ -61,7 +66,72 @@ public class EntityCreationService {
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsInclusive(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
public Stream<RedactionEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
|
||||
|
||||
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByString(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByString(stop, node.getTextBlock());
|
||||
|
||||
startBoundaries.forEach(boundary -> {
|
||||
boundary.setStart(boundary.start() - start.length());
|
||||
boundary.setEnd(boundary.end() - start.length());
|
||||
});
|
||||
|
||||
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
|
||||
|
||||
startBoundaries.forEach(boundary -> {
|
||||
boundary.setStart(boundary.start() - start.length());
|
||||
boundary.setEnd(boundary.end() - start.length());
|
||||
});
|
||||
|
||||
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByString(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByString(stop, node.getTextBlock());
|
||||
|
||||
stopBoundaries.forEach(boundary -> {
|
||||
boundary.setStart(boundary.start() + stop.length());
|
||||
boundary.setEnd(boundary.end() + stop.length());
|
||||
});
|
||||
|
||||
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
|
||||
|
||||
stopBoundaries.forEach(boundary -> {
|
||||
boundary.setStart(boundary.start() + stop.length());
|
||||
boundary.setEnd(boundary.end() + stop.length());
|
||||
});
|
||||
|
||||
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByString(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByString(stop, node.getTextBlock());
|
||||
@ -79,7 +149,10 @@ public class EntityCreationService {
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsInclusiveIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
|
||||
@ -97,13 +170,7 @@ public class EntityCreationService {
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node) {
|
||||
|
||||
List<Boundary> startBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(start, node.getTextBlock());
|
||||
List<Boundary> stopBoundaries = RedactionSearchUtility.findBoundariesByStringIgnoreCase(stop, node.getTextBlock());
|
||||
|
||||
return betweenBoundaries(startBoundaries, stopBoundaries, type, entityType, node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node) {
|
||||
@ -209,6 +276,55 @@ public class EntityCreationService {
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> lineAfterStringAcrossColumns(String string, String type, EntityType entityType, Table tableNode) {
|
||||
|
||||
return tableNode.streamTableCells()
|
||||
.flatMap(tableCell -> lineAfterBoundariesAcrossColumns(RedactionSearchUtility.findBoundariesByString(string, tableCell.getTextBlock()),
|
||||
tableCell,
|
||||
type,
|
||||
entityType,
|
||||
tableNode));
|
||||
}
|
||||
|
||||
|
||||
public Stream<RedactionEntity> lineAfterStringAcrossColumnsIgnoreCase(String string, String type, EntityType entityType, Table tableNode) {
|
||||
|
||||
return tableNode.streamTableCells()
|
||||
.flatMap(tableCell -> lineAfterBoundariesAcrossColumns(RedactionSearchUtility.findBoundariesByStringIgnoreCase(string, tableCell.getTextBlock()),
|
||||
tableCell,
|
||||
type,
|
||||
entityType,
|
||||
tableNode));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Looks across the remaining table row to the right of the provided TableCell if any line intersects the y coordinates of the found text.
|
||||
*
|
||||
* @param boundaries a list of boundaries
|
||||
* @param tableCell the table cell
|
||||
* @param type the type
|
||||
* @param entityType the entity type
|
||||
* @param tableNode the table node
|
||||
* @return a stream of RedactionEntities
|
||||
*/
|
||||
private Stream<RedactionEntity> lineAfterBoundariesAcrossColumns(List<Boundary> boundaries, TableCell tableCell, String type, EntityType entityType, Table tableNode) {
|
||||
|
||||
return boundaries.stream()
|
||||
.map(boundary -> RectangleTransformations.rectangle2DBBox(tableCell.getTextBlock().getPositions(boundary)))
|
||||
.map(bBox -> Pair.of(bBox.getMaxY(), bBox.getMinY()))
|
||||
.map(maxMinPair -> tableNode.streamRow(tableCell.getRow())
|
||||
.filter(nextTableCell -> nextTableCell.getCol() > tableCell.getCol())
|
||||
.map(nextTableCell -> RedactionSearchUtility.findBoundaryOfAllLinesInYRange(maxMinPair.getLeft(), maxMinPair.getRight(), nextTableCell.getTextBlock()))
|
||||
.map(b -> b.trim(tableNode.getTextBlock()))
|
||||
.filter(boundary -> isValidEntityBoundary(tableNode.getTextBlock(), boundary))
|
||||
.map(boundary -> byBoundary(boundary, type, entityType, tableNode))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get))
|
||||
.flatMap(Functions.identity());
|
||||
}
|
||||
|
||||
|
||||
public Optional<RedactionEntity> semanticNodeAfterString(SemanticNode semanticNode, String string, String type, EntityType entityType) {
|
||||
|
||||
var textBlock = semanticNode.getTextBlock();
|
||||
|
||||
@ -2,11 +2,13 @@ package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.Boundary;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity;
|
||||
@ -86,6 +88,22 @@ public class RedactionSearchUtility {
|
||||
return expandedStart;
|
||||
}
|
||||
|
||||
public static Boundary findBoundaryOfAllLinesInYRange(double maxY, double minY, TextBlock textBlock) {
|
||||
|
||||
List<Boundary> lineBoundaries = IntStream.range(0, textBlock.numberOfLines()).boxed().map(textBlock::getLineBoundary).filter(lineBoundary -> isWithinYRange(maxY, minY, textBlock, lineBoundary)).toList();
|
||||
if (lineBoundaries.isEmpty()) {
|
||||
return new Boundary(textBlock.getBoundary().start(), textBlock.getBoundary().start());
|
||||
}
|
||||
return Boundary.merge(lineBoundaries);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isWithinYRange(double maxY, double minY, TextBlock textBlock, Boundary lineBoundary) {
|
||||
|
||||
Rectangle2D lineBBox = RectangleTransformations.rectangle2DBBox(textBlock.getPositions(lineBoundary));
|
||||
return lineBBox.getMinY() < maxY && minY < lineBBox.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public static List<Boundary> findBoundariesByRegex(String regexPattern, TextBlock textBlock) {
|
||||
|
||||
@ -158,8 +176,8 @@ public class RedactionSearchUtility {
|
||||
|
||||
public static List<Boundary> findBoundariesByStringIgnoreCase(String searchString, TextBlock textBlock) {
|
||||
|
||||
String searchStringLowerCase = searchString.toLowerCase(Locale.ROOT);
|
||||
return findBoundariesByString(searchStringLowerCase, textBlock);
|
||||
Pattern pattern = Pattern.compile(Pattern.quote(searchString), Pattern.CASE_INSENSITIVE);
|
||||
return getBoundariesByPattern(textBlock, 0, pattern);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -46,8 +46,8 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
|
||||
AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/403-17_Fantom_ToxicidadeInalatoriaAguda.pdf",
|
||||
"files/Documine/Flora/ProblemDocs/d75cd9358f7949552697764428183472.TABLES.json");
|
||||
AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
|
||||
"files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");
|
||||
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
|
||||
@ -315,36 +315,36 @@ rule "DOC.5.0: Strain"
|
||||
end
|
||||
|
||||
|
||||
//rule "DOC.7.0: study title by document structure"
|
||||
// when
|
||||
// $table: Table(isOnPage(1),
|
||||
// (containsString("Final Report") || containsString("SPL")),
|
||||
// numberOfRows == 1,
|
||||
// numberOfCols == 1)
|
||||
// then
|
||||
//
|
||||
// entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
||||
// entity.apply("DOC.7.0", "Study title found", "n-a");
|
||||
// });
|
||||
// end
|
||||
|
||||
|
||||
rule "DOC.7.0: study title"
|
||||
rule "DOC.7.0: study title by document structure"
|
||||
when
|
||||
$section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL")))
|
||||
$table: Table(isOnPage(1),
|
||||
(containsString("Final Report") || containsString("SPL")),
|
||||
numberOfRows == 1,
|
||||
numberOfCols == 1)
|
||||
then
|
||||
entityCreationService.byRegexWithLineBreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
entity.apply("DOC.7.0", "Title found", "n-a");
|
||||
});
|
||||
entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
entity.apply("DOC.7.0", "Title found", "n-a");
|
||||
});
|
||||
entityCreationService.betweenStrings("Laboratories", "SPL", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
entity.apply("DOC.7.0", "Title found", "n-a");
|
||||
|
||||
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
||||
entity.apply("DOC.7.0", "Study title found", "n-a");
|
||||
});
|
||||
end
|
||||
|
||||
|
||||
//rule "DOC.7.0: study title"
|
||||
// when
|
||||
// $section: Section(isOnPage(1) && (containsString("Final Report") || containsString("SPL")))
|
||||
// then
|
||||
// entityCreationService.byRegexWithLineBreaks("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
// entity.apply("DOC.7.0", "Title found", "n-a");
|
||||
// });
|
||||
// entityCreationService.betweenStrings("TITLE", "DATA REQUIREMENT", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
// entity.apply("DOC.7.0", "Title found", "n-a");
|
||||
// });
|
||||
// entityCreationService.betweenStrings("Laboratories", "SPL", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
|
||||
// entity.apply("DOC.7.0", "Title found", "n-a");
|
||||
// });
|
||||
// end
|
||||
|
||||
|
||||
rule "DOC.8.1: Performing Laboratory (Name)"
|
||||
when
|
||||
$section: Section(containsString("PERFORMING LABORATORY:"))
|
||||
@ -422,8 +422,8 @@ rule "DOC.10.0: Batch number from CoA"
|
||||
$section: Section(
|
||||
(
|
||||
anyHeadlineContainsString("Analytical Report")
|
||||
|| anyHeadlineContainsString("Certificate of Analysis")
|
||||
|| containsStringIgnoreCase("certificate of analysis")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Certificate of Analysis")
|
||||
|| containsStringIgnoreCase("Certificate of Analysis")
|
||||
)
|
||||
&& (
|
||||
containsStringIgnoreCase("batch")
|
||||
@ -460,16 +460,16 @@ rule "DOC.10.1: Batch number"
|
||||
when
|
||||
$section: Section(
|
||||
(
|
||||
anyHeadlineContainsString("Test Substance")
|
||||
|| getHeadline().containsString("Test and Control Substances")
|
||||
|| getHeadline().containsString("Test Substances")
|
||||
|| getHeadline().containsString("Test Substance")
|
||||
|| getHeadline().containsString("Test Item")
|
||||
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
||||
)
|
||||
&& !(
|
||||
getHeadline().containsString("component")
|
||||
|| getHeadline().containsString("reference")
|
||||
|| getHeadline().containsString("blank")
|
||||
anyHeadlineContainsString("component")
|
||||
|| anyHeadlineContainsString("reference")
|
||||
|| anyHeadlineContainsString("blank")
|
||||
)
|
||||
&& containsStringIgnoreCase("batch")
|
||||
)
|
||||
@ -491,6 +491,35 @@ rule "DOC.10.1: Batch number"
|
||||
end
|
||||
|
||||
|
||||
// Creates batch_number entities from tables in test-substance sections: the lines to the
// right of a "Batch number:" / "Batch (Lot) Number:" label in the same table row.
rule "DOC.10.2: Batch number"
    when
        $section: Section(
            (
                // the original listed "Test Substance" twice; one occurrence is enough
                anyHeadlineContainsStringIgnoreCase("Test Substance")
                || anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
                || anyHeadlineContainsStringIgnoreCase("Test Substances")
                || anyHeadlineContainsStringIgnoreCase("Test Item")
            )
            && !(
                anyHeadlineContainsString("component")
                || anyHeadlineContainsString("reference")
                || anyHeadlineContainsString("blank")
            )
            && containsStringIgnoreCase("batch")
        )
        // one activation per table below the matched section
        $table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
    then
        entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table)
            .forEach(entity -> entity.apply("DOC.10.2", "Batch number found", "n-a"));
        entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table)
            .forEach(entity -> entity.apply("DOC.10.2", "Batch number found", "n-a"));
end
|
||||
|
||||
|
||||
|
||||
|
||||
rule "DOC.11.0: Conclusions - LD50, LC50, Confidence"
|
||||
when
|
||||
@ -529,7 +558,7 @@ rule "DOC.12.0: Guideline Deviation"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436","471"))
|
||||
$section: Section(
|
||||
(getHeadline().containsString("General Information") || containsString("GENERAL INFORMATION"))
|
||||
(getHeadline().containsStringIgnoreCase("General Information") || containsString("GENERAL INFORMATION"))
|
||||
&& (containsStringIgnoreCase("from the") || containsStringIgnoreCase("to the"))
|
||||
)
|
||||
then
|
||||
@ -542,7 +571,7 @@ rule "DOC.12.0: Guideline Deviation"
|
||||
entityCreationService.betweenStrings("Deviations from the study plan", "Regulatory Guidelines", "guideline_deviation", EntityType.ENTITY, $section).forEach(entity -> {
|
||||
entity.apply("DOC.12.0", "Deviation from the study plan found", "n-a");
|
||||
});
|
||||
entityCreationService.byRegex("(?>Study plan adherence)(.{1,20}deviations.{1,20} to the study plan.{0,50}\\.)\\s", "guideline_deviation", EntityType.ENTITY, 1, $section).forEach(entity -> {
|
||||
entityCreationService.byRegexIgnoreCase("(?>Study plan adherence)(.{1,20}deviations.{1,20} to the study plan.{0,50}\\.)\\s", "guideline_deviation", EntityType.ENTITY, 1, $section).forEach(entity -> {
|
||||
entity.apply("DOC.12.0", "Guideline deviation found in text.", "n-a");
|
||||
});
|
||||
entityCreationService.betweenStrings("Deviations from the study plan", "validity of the study.", "guideline_deviation", EntityType.ENTITY, $section).forEach(entity -> {
|
||||
@ -579,14 +608,14 @@ rule "DOC.14.0: Dosages"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "425")
|
||||
$section: Section(
|
||||
(anyHeadlineContainsString("Dosages") || anyHeadlineContainsString("Study Design"))
|
||||
(anyHeadlineContainsStringIgnoreCase("Dosages") || anyHeadlineContainsStringIgnoreCase("Study Design"))
|
||||
&& !getHeadline().containsString("TABLE")
|
||||
)
|
||||
then
|
||||
entityCreationService.betweenStringsInclusive("The animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
|
||||
entityCreationService.betweenStringsIncludeStartAndEnd("The animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
|
||||
entity.apply("DOC.14.0", "Dosage found", "n-a");
|
||||
});
|
||||
entityCreationService.betweenStringsInclusive("Animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
|
||||
entityCreationService.betweenStringsIncludeStartAndEnd("Animals were treated", ".", "dosages", EntityType.ENTITY, $section).forEach(entity -> {
|
||||
entity.apply("DOC.14.0", "Dosage found", "n-a");
|
||||
});
|
||||
entityCreationService.byRegexWithLineBreaks("(?:\\.[\\s|\\n]|^.{5,20}\\n)([^\\.]{1,200}(?:animal|given|received)[^\\.]{1,200}dose\\s(?:levels?\\s)?(?:of|at)[^\\.]{1,200})(?:\\.[\\s|\\n|$])", "dosages", EntityType.ENTITY,1, $section).forEach(entity -> {
|
||||
@ -615,7 +644,7 @@ rule "DOC.17.0: Study Conclusion"
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_conclusion", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.17.0", "Study Conclusion found", "n-a"));
|
||||
end
|
||||
|
||||
/*
|
||||
|
||||
rule "DOC.18.0: Weight Behavior Changes"
|
||||
when
|
||||
@ -633,8 +662,84 @@ rule "DOC.18.0: Weight Behavior Changes"
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "weight_behavior_changes", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.18.0", "Weight behavior changes found", "n-a"));
|
||||
end
|
||||
*/
|
||||
|
||||
// OECD 402 only: marks the paragraphs of "Results" sections that mention body weight
// as weight_behavior_changes entities.
rule "DOC.18.0: Weight Behavior Changes"
    when
        FileAttribute(label == "OECD Number", value == "402")
        $section: Section(
            getHeadline().containsString("Results")
            && (containsString("body weight") || containsString("body weights") || containsString("bodyweight") || containsString("bodyweights"))
        )
    then
        entityCreationService.bySemanticNodeParagraphsOnly($section, "weight_behavior_changes", EntityType.ENTITY)
            .forEach(entity -> entity.apply("DOC.18.0", "Weight behavior changes found", "n-a"));
end
|
||||
|
||||
// OECD 402/403/436: marks the paragraphs of necropsy / macroscopic-findings sections as
// necropsy_findings entities, unless the section headline points to a table, an appendix
// or the "3 - MACROSCOPIC FINDINGS" headline.
rule "DOC.19.0: Necropsy findings"
    when
        FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","436"))
        $section: Section(
            (
                anyHeadlineContainsStringIgnoreCase("Necropsy")
                || getHeadline().containsStringIgnoreCase("Macroscopic Findings")
                || getHeadline().containsStringIgnoreCase("Macroscopic examination")
            )
            && !(
                getHeadline().containsStringIgnoreCase("Table")
                || getHeadline().containsStringIgnoreCase("Appendix")
                || getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS")
            )
            // alternatives tried for the last exclusion, kept for reference:
            //&& !containsString("3 - MACROSCOPIC FINDINGS")
            //&& !anyHeadlineContainsString("3 - MACROSCOPIC FINDINGS")
        )
    then
        entityCreationService.bySemanticNodeParagraphsOnly($section, "necropsy_findings", EntityType.ENTITY)
            .forEach(entity -> entity.apply("DOC.19.0", "Necropsy section found", "n-a"));
end
|
||||
|
||||
/*
|
||||
rule "DOC.19.0: Necropsy findings"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","436"))
|
||||
$section: Section(
|
||||
(
|
||||
anyHeadlineContainsStringIgnoreCase("Necropsy")
|
||||
|| getHeadline().containsStringIgnoreCase("Macroscopic Findings")
|
||||
|| getHeadline().containsStringIgnoreCase("Macroscopic examination")
|
||||
)
|
||||
&& !getHeadline().containsStringIgnoreCase("Table")
|
||||
&& !getHeadline().containsStringIgnoreCase("Appendix")
|
||||
)
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "necropsy_findings", EntityType.ENTITY)
|
||||
.forEach( entity -> entity.apply("DOC.19.0", "Necropsy section found", "n-a"));
|
||||
end
|
||||
*/
|
||||
|
||||
// OECD 403 only: marks the paragraphs of clinical / in-life / postmortem observation sections
// as clinical_observations entities, unless any headline mentions an appendix, a table or
// mortality.
rule "DOC.22.0: Clinical observations"
    when
        FileAttribute(label == "OECD Number", value == "403")
        $section: Section(
            (
                anyHeadlineContainsStringIgnoreCase("Clinical Observations")
                || anyHeadlineContainsStringIgnoreCase("Clinical observations")
                || anyHeadlineContainsStringIgnoreCase("In-life Observations")
                || anyHeadlineContainsStringIgnoreCase("Postmortem Observations")
            )
            && !(
                anyHeadlineContainsStringIgnoreCase("Appendix")
                || anyHeadlineContainsStringIgnoreCase("Table")
                || anyHeadlineContainsStringIgnoreCase("Mortality")
            )
        )
    then
        entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_observations", EntityType.ENTITY)
            .forEach(entity -> entity.apply("DOC.22.0", "Clinical observations section found", "n-a"));
end
|
||||
|
||||
/*
|
||||
rule "DOC.19.0: Necropsy findings"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","436"))
|
||||
@ -670,7 +775,7 @@ rule "DOC.22.0: Clinical observations"
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "clinical_observations", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.22.0", "Clinical observations section found", "n-a"));
|
||||
end
|
||||
|
||||
*/
|
||||
|
||||
/* These two were also commented out previously
|
||||
rule "DOC.23.1: Bodyweight changes"
|
||||
@ -723,19 +828,33 @@ rule "DOC.24.0: Study Design"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
|
||||
$section: Section(
|
||||
//retry this with only getHeadline().containsStringIgnoreCase("study design")
|
||||
anyHeadlineContainsStringIgnoreCase("study design")
|
||||
&& !anyHeadlineContainsString("Preliminary screening test")
|
||||
)
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_design", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.24.0", "Study design section found", "n-a"));
|
||||
end
|
||||
|
||||
/*
|
||||
rule "DOC.24.0: Study Design"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","404","405","406","428","429","438","439","474","487"))
|
||||
$section: Section(
|
||||
anyHeadlineContainsStringIgnoreCase("study design")
|
||||
)
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "study_design", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.24.0", "Study design section found", "n-a"));
|
||||
end
|
||||
*/
|
||||
|
||||
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
|
||||
$section: Section(
|
||||
(getHeadline().containsString("Results") || getHeadline().containsString("Conclusion"))
|
||||
(getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion"))
|
||||
&& !getHeadline().containsString("POSITIVE CONTROL")
|
||||
&& !getHeadline().containsString("Positive Control")
|
||||
&& !getHeadline().containsString("Evaluation")
|
||||
@ -752,7 +871,22 @@ rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
||||
end
|
||||
|
||||
|
||||
// TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first.
|
||||
// DOC.26.0: creates "detailing" entities from the paragraphs of a matching Section.
// Conditions (both must hold):
//   - a FileAttribute labelled "OECD Number" whose value equals "404" or "405" exists;
//   - a Section for which anyHeadlineContainsStringIgnoreCase("Results") holds, while its own
//     headline (getHeadline()) contains neither "Evaluation" nor "study", ignoring case.
// Action: every entity returned by entityCreationService.bySemanticNodeParagraphsOnly(...)
// has apply("DOC.26.0", "Detailing found", "n-a") called on it.
// NOTE(review): the inclusion test uses anyHeadline...(), the exclusions use getHeadline() only;
// presumably intentional (broad match, exclusions on the direct headline) - confirm.
rule "DOC.26.0: Detailing (404 & 405)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405"))
|
||||
$section: Section(
|
||||
anyHeadlineContainsStringIgnoreCase("Results")
|
||||
&& !getHeadline().containsStringIgnoreCase("Evaluation")
|
||||
&& !getHeadline().containsStringIgnoreCase("study")
|
||||
)
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "detailing", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.26.0", "Detailing found", "n-a"));
|
||||
end
|
||||
|
||||
|
||||
/*
|
||||
rule "DOC.26.0: Detailing (404 & 405)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("404","405"))
|
||||
@ -766,13 +900,13 @@ rule "DOC.26.0: Detailing (404 & 405)"
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "detailing", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.26.0", "Detailing found", "n-a"));
|
||||
end
|
||||
|
||||
*/
|
||||
|
||||
rule "DOC.32.0: Preliminary Test Results (429)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "429")
|
||||
$section: Section(
|
||||
((getHeadline().containsString("Preliminary Screening Test") && containsString("Clinical observations"))
|
||||
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|
||||
|| getHeadline().containsString("Pre-Experiment"))
|
||||
)
|
||||
then
|
||||
@ -942,18 +1076,28 @@ rule "DOC.40.0: Positive Control"
|
||||
rule "DOC.42.0: Mortality Statement"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "402")
|
||||
$headline: Headline(containsString("Mortality") && !containsString("TABLE"))
|
||||
$headline: Headline(containsStringIgnoreCase("Mortality") && !containsString("TABLE"))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality_statement", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.42.0", "Mortality Statement found", "n-a"));
|
||||
end
|
||||
|
||||
/*
|
||||
rule "DOC.42.0: Mortality Statement"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "402")
|
||||
$headline: Headline(containsString("Mortality") && !containsString("TABLE"))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "mortality_statement", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.42.0", "Mortality Statement found", "n-a"));
|
||||
end
|
||||
*/
|
||||
|
||||
rule "DOC.43.0: Dose Mortality"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "425")
|
||||
$table: Table(
|
||||
(hasHeader("Mortality") || hasHeader("Long Term Results") || hasHeader("Long Term Outcome") || hasHeader("Comments") || hasHeader("Viability / Mortality") || hasHeader("Viability/Mortality"))
|
||||
(hasHeader("Mortality") || hasHeader("Long Term Results") || hasHeader("LongTerm Outcome") || hasHeader("Long Term Outcome") || hasHeader("Comments") || hasHeader("Viability / Mortality") || hasHeader("Viability/Mortality"))
|
||||
&&
|
||||
(hasHeader("Dose [mg/kg bodyweight]") || hasHeader("Dose [mg/kg body weight]") ||hasHeader("Dose (mg/kg)") || hasHeader("Dose levei (mg/kg)") || hasHeader("Dose Level (mg/kg)") || hasHeader("Dose level (mg/kg)") || hasHeader("Dosage [mg/kg body weight]"))
|
||||
)
|
||||
@ -962,6 +1106,7 @@ rule "DOC.43.0: Dose Mortality"
|
||||
$table.streamTableCellsWithHeader("Comments"),
|
||||
$table.streamTableCellsWithHeader("Long Term Results"),
|
||||
$table.streamTableCellsWithHeader("Long Term Outcome"),
|
||||
$table.streamTableCellsWithHeader("LongTerm Outcome"),
|
||||
$table.streamTableCellsWithHeader("Viability / Mortality"),
|
||||
$table.streamTableCellsWithHeader("Viability/Mortality")
|
||||
).flatMap(a -> a)
|
||||
@ -1110,6 +1255,7 @@ rule "X.0.0: remove Entity contained by Entity of same type"
|
||||
retract($contained);
|
||||
end
|
||||
|
||||
|
||||
// Rule unit: X.7
|
||||
rule "X.7.0: remove all images"
|
||||
salience 512
|
||||
@ -1120,6 +1266,7 @@ rule "X.7.0: remove all images"
|
||||
retract($image);
|
||||
end
|
||||
|
||||
|
||||
//------------------------------------ File attributes rules ------------------------------------
|
||||
|
||||
// Rule unit: FA.1
|
||||
@ -1134,8 +1281,6 @@ rule "FA.1.0: remove duplicate FileAttributes"
|
||||
end
|
||||
|
||||
|
||||
//------------------------------------ Local dictionary search rules ------------------------------------
|
||||
|
||||
// Rule unit: LDS.0
|
||||
rule "LDS.0.0: run local dictionary search"
|
||||
agenda-group "LOCAL_DICTIONARY_ADDS"
|
||||
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user