RED-5253: Improved headline detection for DocuMine 2 #60
@ -69,20 +69,7 @@ public class DocuMineClassificationService implements ClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
@ -95,6 +82,19 @@ public class DocuMineClassificationService implements ClassificationService {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
|
||||
@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
||||
@Disabled
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/F.2. A16003E - Acute Inhalation Study.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf");
|
||||
|
||||
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf",
|
||||
// "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json");
|
||||
|
||||
@ -55,16 +55,30 @@ query "getFileAttributes"
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
|
||||
rule "H.0.0 retract table of contents page"
|
||||
when
|
||||
$page: Page(getMainBodyTextBlock().getSearchText().contains("........"))
|
||||
$page: Page(getMainBodyTextBlock().getSearchText().contains("........") || (getMainBodyTextBlock().getSearchText().contains("APPENDICES") && getMainBodyTextBlock().getSearchText().contains("TABLES")))
|
||||
$node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1))
|
||||
then
|
||||
retract($node);
|
||||
end
|
||||
|
||||
|
||||
// Removes table-of-contents content from the working memory.
// Trigger: any Headline whose text contains the string "CONTENTS"
// (containsString is presumably case-sensitive, since an IgnoreCase variant exists - TODO confirm).
// Action: for every page spanned by the headline's parent node, each node of that
// page's main body is retracted unless one of the node's own pages has a lower
// page number than that page (i.e. nodes reaching back to an earlier page are kept).
// salience 10 gives this rule priority over rules with a lower salience value.
rule "H.0.0: Ignore Table of Contents"
|
||||
salience 10
|
||||
when
|
||||
$tocHeadline: Headline(containsString("CONTENTS"))
|
||||
|
||||
then
|
||||
$tocHeadline.getParent().getPages()
|
||||
.forEach(page -> page.getMainBody().stream()
|
||||
.filter(node -> node.getPages().stream().noneMatch(nodePage -> nodePage.getNumber() < page.getNumber()))
|
||||
.forEach(node -> retract(node))
|
||||
);
|
||||
end
|
||||
|
||||
|
||||
|
||||
// Rule unit: MAN.0
|
||||
rule "H.0.0: Show headlines"
|
||||
when
|
||||
@ -262,6 +276,7 @@ rule "DOC.3.1: Experimental Completion Date"
|
||||
|
||||
// ignore species and strain in irrelevant study types
|
||||
rule "DOC.4.1: Species"
|
||||
salience 1
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487"))
|
||||
$section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain"))
|
||||
@ -275,6 +290,7 @@ rule "DOC.3.1: Experimental Completion Date"
|
||||
|
||||
// hide all skipped species and strains except in the relevant sections
|
||||
rule "DOC.4.2: Species"
|
||||
salience 1
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436"))
|
||||
$section: Section(
|
||||
@ -329,7 +345,7 @@ rule "DOC.7.0: study title by document structure"
|
||||
$table: Table(isOnPage(1),
|
||||
(containsString("Final Report") || containsString("SPL")),
|
||||
numberOfRows == 1,
|
||||
numberOfCols == 1, getCell(0,0).streamChildren().count() == 3)
|
||||
numberOfCols == 1)
|
||||
then
|
||||
|
||||
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
||||
@ -359,6 +375,7 @@ rule "DOC.7.2: study title"
|
||||
end
|
||||
|
||||
|
||||
|
||||
rule "DOC.8.1: Performing Laboratory (Name)"
|
||||
when
|
||||
$section: Section(containsString("PERFORMING LABORATORY:"))
|
||||
@ -474,10 +491,8 @@ rule "DOC.10.1: Batch number"
|
||||
when
|
||||
$section: Section(
|
||||
(
|
||||
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
||||
)
|
||||
&& !(
|
||||
@ -511,8 +526,6 @@ rule "DOC.10.2: Batch number"
|
||||
(
|
||||
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
||||
)
|
||||
&& !(
|
||||
@ -522,12 +535,18 @@ rule "DOC.10.2: Batch number"
|
||||
)
|
||||
&& containsStringIgnoreCase("batch")
|
||||
)
|
||||
$table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
|
||||
$batchNumber: String() from List.of("Batch Identification",
|
||||
"Batch number:",
|
||||
"Batch reference number:",
|
||||
"Batch:",
|
||||
"Batch/Lot number:",
|
||||
"Batch (Lot) Number:",
|
||||
"Batch Number:",
|
||||
"Batch Nº:",
|
||||
"Batch no:")
|
||||
$table: Table(containsStringIgnoreCase($batchNumber)) from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
|
||||
then
|
||||
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
|
||||
entity.apply("DOC.10.2", "Batch number found", "n-a");
|
||||
});
|
||||
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
|
||||
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase($batchNumber, "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
|
||||
entity.apply("DOC.10.2", "Batch number found", "n-a");
|
||||
});
|
||||
end
|
||||
@ -611,12 +630,22 @@ rule "DOC.12.1: Guideline Deviation in text"
|
||||
rule "DOC.13.0: Clinical Signs"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "425")
|
||||
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
|
||||
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE") && !getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS"))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
|
||||
end
|
||||
|
||||
/*
|
||||
rule "DOC.13.0: Clinical Signs"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", value == "425")
|
||||
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
|
||||
end
|
||||
*/
|
||||
|
||||
rule "DOC.14.0: Dosages"
|
||||
when
|
||||
@ -879,24 +908,33 @@ rule "DOC.24.0: Study Design"
|
||||
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
|
||||
$section: Section(
|
||||
(getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion"))
|
||||
&& !getHeadline().containsString("POSITIVE CONTROL")
|
||||
&& !getHeadline().containsString("Positive Control")
|
||||
&& !getHeadline().containsString("Evaluation")
|
||||
&& !getHeadline().containsString("Micronucleus")
|
||||
&& !getHeadline().containsString("TABLE")
|
||||
&& !getHeadline().containsString("DISCUSSION")
|
||||
&& !getHeadline().containsString("CONCLUSIONS")
|
||||
&& !getHeadline().containsString("Interpretation")
|
||||
&& !getHeadline().containsString("Viability")
|
||||
)
|
||||
$parentHeadline: Headline(
|
||||
containsAnyStringIgnoreCase("Results", "Conclusion"),
|
||||
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
|
||||
$sectionIdentifier: getSectionIdentifier()
|
||||
)
|
||||
not Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY)
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($parentHeadline.getParent(), "results_and_conclusion", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a"));
|
||||
end
|
||||
|
||||
|
||||
// Creates "results_and_conclusion" entities for the sub-headlines of a Results/Conclusion headline.
// Preconditions:
//   - the file attribute "OECD Number" equals one of 406, 428, 438, 439, 474, 487;
//   - a Headline exists that contains "Results" or "Conclusion" (ignoring case) and none of the
//     excluded strings listed in containsAnyString(...); its section identifier is bound to
//     $sectionIdentifier;
//   - $headline is a Headline whose section identifier is a child of $sectionIdentifier.
// The rule fires once per matching ($sectionIdentifier, $headline) combination.
// Action: entities are created from $headline's parent node via bySemanticNodeParagraphsOnly
// (presumably restricted to paragraph nodes - TODO confirm) and each is applied with
// rule id "DOC.25.1".
rule "DOC.25.1: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
||||
when
|
||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
|
||||
Headline(
|
||||
containsAnyStringIgnoreCase("Results", "Conclusion"),
|
||||
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
|
||||
$sectionIdentifier: getSectionIdentifier()
|
||||
)
|
||||
$headline: Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "results_and_conclusion", EntityType.ENTITY)
|
||||
.forEach(entity -> entity.apply("DOC.25.1", "Results and Conclusion found", "n-a"));
|
||||
end
|
||||
|
||||
|
||||
// TBD: This rule now finds both "Results" and "RESULTS AND DISCUSSION". This ensures that some files no longer end up with empty components. In "RESULTS AND DISCUSSION" we should find every subsection, not just the first one.
|
||||
rule "DOC.26.0: Detailing (404 & 405)"
|
||||
when
|
||||
@ -933,7 +971,7 @@ rule "DOC.32.0: Preliminary Test Results (429)"
|
||||
FileAttribute(label == "OECD Number", value == "429")
|
||||
$section: Section(
|
||||
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|
||||
|| getHeadline().containsString("Pre-Experiment"))
|
||||
|| anyHeadlineContainsString("Pre-Experiment"))
|
||||
)
|
||||
then
|
||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user