RED-5253: Improved headline detection for DocuMine 2 #60

Merged
dominique.eiflaender1 merged 1 commit from RED-5253 into master 2023-07-24 16:25:01 +02:00
3 changed files with 81 additions and 43 deletions

View File

@ -69,20 +69,7 @@ public class DocuMineClassificationService implements ClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
document.setHeadlines(true);
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
@ -95,6 +82,19 @@ public class DocuMineClassificationService implements ClassificationService {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);

View File

@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Disabled
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/F.2. A16003E - Acute Inhalation Study.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf");
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf",
// "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json");

View File

@ -55,16 +55,30 @@ query "getFileAttributes"
//---------------------------------------------------------------------------
rule "H.0.0 retract table of contents page"
when
$page: Page(getMainBodyTextBlock().getSearchText().contains("........"))
$page: Page(getMainBodyTextBlock().getSearchText().contains("........") || (getMainBodyTextBlock().getSearchText().contains("APPENDICES") && getMainBodyTextBlock().getSearchText().contains("TABLES")))
$node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1))
then
retract($node);
end
// Removes table-of-contents content from working memory: for every page spanned by the parent
// of a headline containing "CONTENTS", each main-body node that does not also appear on a
// lower-numbered page is retracted.
// Salience 10 is higher than the salience used by the other rules visible in this file (1 or unset),
// so Drools schedules this rule's activations ahead of theirs.
// NOTE(review): containsString is presumably case-sensitive (other rules use ...IgnoreCase variants),
// so a "Contents" headline would not trigger this rule - confirm that is intended.
rule "H.0.0: Ignore Table of Contents"
salience 10
when
// One activation per headline whose text contains the literal "CONTENTS".
$tocHeadline: Headline(containsString("CONTENTS"))
then
// The filter keeps only nodes none of whose pages has a number below the current page's number;
// exactly those nodes are retracted. Nodes that also appear on an earlier page are left alone.
$tocHeadline.getParent().getPages()
.forEach(page -> page.getMainBody().stream()
.filter(node -> node.getPages().stream().noneMatch(nodePage -> nodePage.getNumber() < page.getNumber()))
.forEach(node -> retract(node))
);
end
// Rule unit: MAN.0
rule "H.0.0: Show headlines"
when
@ -262,6 +276,7 @@ rule "DOC.3.1: Experimental Completion Date"
// ignore species and strain in irrelevant study types
rule "DOC.4.1: Species"
salience 1
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487"))
$section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain"))
@ -275,6 +290,7 @@ rule "DOC.3.1: Experimental Completion Date"
// hide all skipped species and strains except in the relevant sections
rule "DOC.4.2: Species"
salience 1
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436"))
$section: Section(
@ -329,7 +345,7 @@ rule "DOC.7.0: study title by document structure"
$table: Table(isOnPage(1),
(containsString("Final Report") || containsString("SPL")),
numberOfRows == 1,
numberOfCols == 1, getCell(0,0).streamChildren().count() == 3)
numberOfCols == 1)
then
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
@ -359,6 +375,7 @@ rule "DOC.7.2: study title"
end
rule "DOC.8.1: Performing Laboratory (Name)"
when
$section: Section(containsString("PERFORMING LABORATORY:"))
@ -474,10 +491,8 @@ rule "DOC.10.1: Batch number"
when
$section: Section(
(
anyHeadlineContainsStringIgnoreCase("Test Substance")
anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test Item")
)
&& !(
@ -511,8 +526,6 @@ rule "DOC.10.2: Batch number"
(
anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test Item")
)
&& !(
@ -522,12 +535,18 @@ rule "DOC.10.2: Batch number"
)
&& containsStringIgnoreCase("batch")
)
$table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
$batchNumber: String() from List.of("Batch Identification",
"Batch number:",
"Batch reference number:",
"Batch:",
"Batch/Lot number:",
"Batch (Lot) Number:",
"Batch Number:",
"Batch Nº:",
"Batch no:")
$table: Table(containsStringIgnoreCase($batchNumber)) from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
then
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
entity.apply("DOC.10.2", "Batch number found", "n-a");
});
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase($batchNumber, "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
entity.apply("DOC.10.2", "Batch number found", "n-a");
});
end
@ -611,12 +630,22 @@ rule "DOC.12.1: Guideline Deviation in text"
rule "DOC.13.0: Clinical Signs"
when
FileAttribute(label == "OECD Number", value == "425")
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE") && !getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS"))
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
end
/*
rule "DOC.13.0: Clinical Signs"
when
FileAttribute(label == "OECD Number", value == "425")
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
end
*/
rule "DOC.14.0: Dosages"
when
@ -879,24 +908,33 @@ rule "DOC.24.0: Study Design"
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
$section: Section(
(getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion"))
&& !getHeadline().containsString("POSITIVE CONTROL")
&& !getHeadline().containsString("Positive Control")
&& !getHeadline().containsString("Evaluation")
&& !getHeadline().containsString("Micronucleus")
&& !getHeadline().containsString("TABLE")
&& !getHeadline().containsString("DISCUSSION")
&& !getHeadline().containsString("CONCLUSIONS")
&& !getHeadline().containsString("Interpretation")
&& !getHeadline().containsString("Viability")
)
$parentHeadline: Headline(
containsAnyStringIgnoreCase("Results", "Conclusion"),
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
$sectionIdentifier: getSectionIdentifier()
)
not Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY)
entityCreationService.bySemanticNodeParagraphsOnly($parentHeadline.getParent(), "results_and_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a"));
end
// Creates "results_and_conclusion" entities for headlines nested under a Results/Conclusion headline.
// Only fires for files whose "OECD Number" attribute equals one of 406, 428, 438, 439, 474 or 487.
rule "DOC.25.1: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
// Parent headline: contains "Results" or "Conclusion" (ignoring case) and none of the excluded strings.
// NOTE(review): containsAnyString is presumably case-sensitive, which would explain why both
// "POSITIVE CONTROL" and "Positive Control" are listed - confirm.
// Its section identifier is bound so the next pattern can look up child headlines.
Headline(
containsAnyStringIgnoreCase("Results", "Conclusion"),
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
$sectionIdentifier: getSectionIdentifier()
)
// One activation per headline whose section identifier reports isChildOf the parent's identifier.
$headline: Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
then
// Entities are built from the child headline's parent node and tagged with rule id "DOC.25.1".
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "results_and_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.25.1", "Results and Conclusion found", "n-a"));
end
// TBD: This rule now matches both "Results" and "RESULTS AND DISCUSSION" headlines, which ensures that some files no longer end up with empty components. Within "RESULTS AND DISCUSSION" it should find every subsection, not just the first.
rule "DOC.26.0: Detailing (404 & 405)"
when
@ -933,7 +971,7 @@ rule "DOC.32.0: Preliminary Test Results (429)"
FileAttribute(label == "OECD Number", value == "429")
$section: Section(
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|| getHeadline().containsString("Pre-Experiment"))
|| anyHeadlineContainsString("Pre-Experiment"))
)
then
entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY)