RED-5253: Improved headline detection for DocuMine 2 #60

Merged
dominique.eiflaender1 merged 1 commit from RED-5253 into master 2023-07-24 16:25:01 +02:00
3 changed files with 81 additions and 43 deletions

View File

@ -69,20 +69,7 @@ public class DocuMineClassificationService implements ClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
document.setHeadlines(true);
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER); textBlock.setClassification(PageBlockType.HEADER);
@ -95,6 +82,19 @@ public class DocuMineClassificationService implements ClassificationService {
if (!Pattern.matches("[0-9]+", textBlock.toString())) { if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE); textBlock.setClassification(PageBlockType.TITLE);
} }
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);

View File

@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Disabled @Disabled
public void titleExtraction() throws IOException { public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/F.2. A16003E - Acute Inhalation Study.pdf"); AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf");
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf", // AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf",
// "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json"); // "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json");

View File

@ -55,16 +55,30 @@ query "getFileAttributes"
//--------------------------------------------------------------------------- //---------------------------------------------------------------------------
rule "H.0.0 retract table of contents page" rule "H.0.0 retract table of contents page"
when when
$page: Page(getMainBodyTextBlock().getSearchText().contains("........")) $page: Page(getMainBodyTextBlock().getSearchText().contains("........") || (getMainBodyTextBlock().getSearchText().contains("APPENDICES") && getMainBodyTextBlock().getSearchText().contains("TABLES")))
$node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1)) $node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1))
then then
retract($node); retract($node);
end end
// Removes table-of-contents content from working memory: for every Headline whose
// text contains "CONTENTS", the main-body nodes on the pages of that headline's
// parent node are retracted.
// NOTE(review): containsString is presumably case-sensitive (other rules in this file
// use containsStringIgnoreCase when they want a case-insensitive match) - confirm.
rule "H.0.0: Ignore Table of Contents"
// Higher salience than the rules that declare salience 1 or none, so this rule is
// given priority on the agenda and the nodes are gone before those rules fire.
salience 10
when
// Fires once per Headline fact that contains the literal "CONTENTS".
$tocHeadline: Headline(containsString("CONTENTS"))
then
// Walk every page covered by the headline's parent node.
$tocHeadline.getParent().getPages()
.forEach(page -> page.getMainBody().stream()
// Keep only nodes that do not also appear on a page with a lower number than the
// current page, i.e. nodes that are not continued from an earlier page.
.filter(node -> node.getPages().stream().noneMatch(nodePage -> nodePage.getNumber() < page.getNumber()))
// Remove the node from working memory so later rules can no longer match it.
.forEach(node -> retract(node))
);
end
// Rule unit: MAN.0 // Rule unit: MAN.0
rule "H.0.0: Show headlines" rule "H.0.0: Show headlines"
when when
@ -262,6 +276,7 @@ rule "DOC.3.1: Experimental Completion Date"
// ignore species and strain in irrelevant study types // ignore species and strain in irrelevant study types
rule "DOC.4.1: Species" rule "DOC.4.1: Species"
salience 1
when when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487")) FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487"))
$section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain")) $section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain"))
@ -275,6 +290,7 @@ rule "DOC.3.1: Experimental Completion Date"
// hide all skipped species and strains except in the relevant sections // hide all skipped species and strains except in the relevant sections
rule "DOC.4.2: Species" rule "DOC.4.2: Species"
salience 1
when when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436")) FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436"))
$section: Section( $section: Section(
@ -329,7 +345,7 @@ rule "DOC.7.0: study title by document structure"
$table: Table(isOnPage(1), $table: Table(isOnPage(1),
(containsString("Final Report") || containsString("SPL")), (containsString("Final Report") || containsString("SPL")),
numberOfRows == 1, numberOfRows == 1,
numberOfCols == 1, getCell(0,0).streamChildren().count() == 3) numberOfCols == 1)
then then
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> { entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
@ -359,6 +375,7 @@ rule "DOC.7.2: study title"
end end
rule "DOC.8.1: Performing Laboratory (Name)" rule "DOC.8.1: Performing Laboratory (Name)"
when when
$section: Section(containsString("PERFORMING LABORATORY:")) $section: Section(containsString("PERFORMING LABORATORY:"))
@ -476,8 +493,6 @@ rule "DOC.10.1: Batch number"
( (
anyHeadlineContainsStringIgnoreCase("Test Substance") anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances") || anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test Item") || anyHeadlineContainsStringIgnoreCase("Test Item")
) )
&& !( && !(
@ -511,8 +526,6 @@ rule "DOC.10.2: Batch number"
( (
anyHeadlineContainsStringIgnoreCase("Test Substance") anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances") || anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|| anyHeadlineContainsStringIgnoreCase("Test Item") || anyHeadlineContainsStringIgnoreCase("Test Item")
) )
&& !( && !(
@ -522,12 +535,18 @@ rule "DOC.10.2: Batch number"
) )
&& containsStringIgnoreCase("batch") && containsStringIgnoreCase("batch")
) )
$table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList() $batchNumber: String() from List.of("Batch Identification",
"Batch number:",
"Batch reference number:",
"Batch:",
"Batch/Lot number:",
"Batch (Lot) Number:",
"Batch Number:",
"Batch Nº:",
"Batch no:")
$table: Table(containsStringIgnoreCase($batchNumber)) from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
then then
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> { entityCreationService.lineAfterStringAcrossColumnsIgnoreCase($batchNumber, "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
entity.apply("DOC.10.2", "Batch number found", "n-a");
});
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
entity.apply("DOC.10.2", "Batch number found", "n-a"); entity.apply("DOC.10.2", "Batch number found", "n-a");
}); });
end end
@ -611,12 +630,22 @@ rule "DOC.12.1: Guideline Deviation in text"
rule "DOC.13.0: Clinical Signs" rule "DOC.13.0: Clinical Signs"
when when
FileAttribute(label == "OECD Number", value == "425") FileAttribute(label == "OECD Number", value == "425")
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE")) $headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE") && !getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS"))
then then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY) entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a")); .forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
end end
/*
rule "DOC.13.0: Clinical Signs"
when
FileAttribute(label == "OECD Number", value == "425")
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
then
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
end
*/
rule "DOC.14.0: Dosages" rule "DOC.14.0: Dosages"
when when
@ -879,24 +908,33 @@ rule "DOC.24.0: Study Design"
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)" rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
when when
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487")) FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
$section: Section( $parentHeadline: Headline(
(getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion")) containsAnyStringIgnoreCase("Results", "Conclusion"),
&& !getHeadline().containsString("POSITIVE CONTROL") !containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
&& !getHeadline().containsString("Positive Control") $sectionIdentifier: getSectionIdentifier()
&& !getHeadline().containsString("Evaluation")
&& !getHeadline().containsString("Micronucleus")
&& !getHeadline().containsString("TABLE")
&& !getHeadline().containsString("DISCUSSION")
&& !getHeadline().containsString("CONCLUSIONS")
&& !getHeadline().containsString("Interpretation")
&& !getHeadline().containsString("Viability")
) )
not Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
then then
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY) entityCreationService.bySemanticNodeParagraphsOnly($parentHeadline.getParent(), "results_and_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a")); .forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a"));
end end
// Creates "results_and_conclusion" entities for the sub-sections of a
// "Results"/"Conclusion" headline. The rule fires once for every combination of a
// matching parent headline and a Headline whose section identifier is a child of the
// parent's identifier.
// NOTE(review): this looks like the counterpart of DOC.25.0, which handles matching
// headlines that have no child headline - confirm both rules together cover every case.
rule "DOC.25.1: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
when
// Only applies to files whose "OECD Number" attribute is one of the listed values.
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
// Parent headline: contains "Results" or "Conclusion" (ignoring case) and none of the
// listed exclusion strings; its section identifier is bound for the join below.
Headline(
containsAnyStringIgnoreCase("Results", "Conclusion"),
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
$sectionIdentifier: getSectionIdentifier()
)
// Child headline: any Headline whose section identifier reports isChildOf the parent's.
$headline: Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
then
// Entities are built from the child headline's parent node (presumably the section
// that owns the headline - verify) and each is applied under rule id "DOC.25.1".
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "results_and_conclusion", EntityType.ENTITY)
.forEach(entity -> entity.apply("DOC.25.1", "Results and Conclusion found", "n-a"));
end
// TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first. // TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first.
rule "DOC.26.0: Detailing (404 & 405)" rule "DOC.26.0: Detailing (404 & 405)"
when when
@ -933,7 +971,7 @@ rule "DOC.32.0: Preliminary Test Results (429)"
FileAttribute(label == "OECD Number", value == "429") FileAttribute(label == "OECD Number", value == "429")
$section: Section( $section: Section(
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations")) ((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|| getHeadline().containsString("Pre-Experiment")) || anyHeadlineContainsString("Pre-Experiment"))
) )
then then
entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY) entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY)