RED-5253: Improved headline detection for DocuMine 2 #60
@ -69,20 +69,7 @@ public class DocuMineClassificationService implements ClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
|
||||||
|
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
|
||||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
|
||||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
|
||||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
|
||||||
document.setHeadlines(true);
|
|
||||||
|
|
||||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
|
||||||
document.setHeadlines(true);
|
|
||||||
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
@ -95,6 +82,19 @@ public class DocuMineClassificationService implements ClassificationService {
|
|||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
|
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||||
|
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
|
|
||||||
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||||
|
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||||
|
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||||
|
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
||||||
|
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||||
|
document.setHeadlines(true);
|
||||||
|
|
||||||
|
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||||
|
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||||
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
|
|||||||
@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void titleExtraction() throws IOException {
|
public void titleExtraction() throws IOException {
|
||||||
|
|
||||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/F.2. A16003E - Acute Inhalation Study.pdf");
|
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf");
|
||||||
|
|
||||||
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf",
|
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf",
|
||||||
// "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json");
|
// "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json");
|
||||||
|
|||||||
@ -55,16 +55,30 @@ query "getFileAttributes"
|
|||||||
|
|
||||||
//---------------------------------------------------------------------------
|
//---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
rule "H.0.0 retract table of contents page"
|
rule "H.0.0 retract table of contents page"
|
||||||
when
|
when
|
||||||
$page: Page(getMainBodyTextBlock().getSearchText().contains("........"))
|
$page: Page(getMainBodyTextBlock().getSearchText().contains("........") || (getMainBodyTextBlock().getSearchText().contains("APPENDICES") && getMainBodyTextBlock().getSearchText().contains("TABLES")))
|
||||||
$node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1))
|
$node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1))
|
||||||
then
|
then
|
||||||
retract($node);
|
retract($node);
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
rule "H.0.0: Ignore Table of Contents"
|
||||||
|
salience 10
|
||||||
|
when
|
||||||
|
$tocHeadline: Headline(containsString("CONTENTS"))
|
||||||
|
|
||||||
|
then
|
||||||
|
$tocHeadline.getParent().getPages()
|
||||||
|
.forEach(page -> page.getMainBody().stream()
|
||||||
|
.filter(node -> node.getPages().stream().noneMatch(nodePage -> nodePage.getNumber() < page.getNumber()))
|
||||||
|
.forEach(node -> retract(node))
|
||||||
|
);
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Rule unit: MAN.0
|
// Rule unit: MAN.0
|
||||||
rule "H.0.0: Show headlines"
|
rule "H.0.0: Show headlines"
|
||||||
when
|
when
|
||||||
@ -262,6 +276,7 @@ rule "DOC.3.1: Experimental Completion Date"
|
|||||||
|
|
||||||
// ignore species and strain in irrelevant study types
|
// ignore species and strain in irrelevant study types
|
||||||
rule "DOC.4.1: Species"
|
rule "DOC.4.1: Species"
|
||||||
|
salience 1
|
||||||
when
|
when
|
||||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487"))
|
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487"))
|
||||||
$section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain"))
|
$section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain"))
|
||||||
@ -275,6 +290,7 @@ rule "DOC.3.1: Experimental Completion Date"
|
|||||||
|
|
||||||
// hide all skipped species and strains except in the relevant sections
|
// hide all skipped species and strains except in the relevant sections
|
||||||
rule "DOC.4.2: Species"
|
rule "DOC.4.2: Species"
|
||||||
|
salience 1
|
||||||
when
|
when
|
||||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436"))
|
FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436"))
|
||||||
$section: Section(
|
$section: Section(
|
||||||
@ -329,7 +345,7 @@ rule "DOC.7.0: study title by document structure"
|
|||||||
$table: Table(isOnPage(1),
|
$table: Table(isOnPage(1),
|
||||||
(containsString("Final Report") || containsString("SPL")),
|
(containsString("Final Report") || containsString("SPL")),
|
||||||
numberOfRows == 1,
|
numberOfRows == 1,
|
||||||
numberOfCols == 1, getCell(0,0).streamChildren().count() == 3)
|
numberOfCols == 1)
|
||||||
then
|
then
|
||||||
|
|
||||||
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
||||||
@ -359,6 +375,7 @@ rule "DOC.7.2: study title"
|
|||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
rule "DOC.8.1: Performing Laboratory (Name)"
|
rule "DOC.8.1: Performing Laboratory (Name)"
|
||||||
when
|
when
|
||||||
$section: Section(containsString("PERFORMING LABORATORY:"))
|
$section: Section(containsString("PERFORMING LABORATORY:"))
|
||||||
@ -476,8 +493,6 @@ rule "DOC.10.1: Batch number"
|
|||||||
(
|
(
|
||||||
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
||||||
)
|
)
|
||||||
&& !(
|
&& !(
|
||||||
@ -511,8 +526,6 @@ rule "DOC.10.2: Batch number"
|
|||||||
(
|
(
|
||||||
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
anyHeadlineContainsStringIgnoreCase("Test Substance")
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
|| anyHeadlineContainsStringIgnoreCase("Test and Control Substances")
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test Substances")
|
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test Substance")
|
|
||||||
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
|| anyHeadlineContainsStringIgnoreCase("Test Item")
|
||||||
)
|
)
|
||||||
&& !(
|
&& !(
|
||||||
@ -522,12 +535,18 @@ rule "DOC.10.2: Batch number"
|
|||||||
)
|
)
|
||||||
&& containsStringIgnoreCase("batch")
|
&& containsStringIgnoreCase("batch")
|
||||||
)
|
)
|
||||||
$table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
|
$batchNumber: String() from List.of("Batch Identification",
|
||||||
|
"Batch number:",
|
||||||
|
"Batch reference number:",
|
||||||
|
"Batch:",
|
||||||
|
"Batch/Lot number:",
|
||||||
|
"Batch (Lot) Number:",
|
||||||
|
"Batch Number:",
|
||||||
|
"Batch Nº:",
|
||||||
|
"Batch no:")
|
||||||
|
$table: Table(containsStringIgnoreCase($batchNumber)) from $section.streamAllSubNodesOfType(NodeType.TABLE).toList()
|
||||||
then
|
then
|
||||||
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
|
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase($batchNumber, "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
|
||||||
entity.apply("DOC.10.2", "Batch number found", "n-a");
|
|
||||||
});
|
|
||||||
entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> {
|
|
||||||
entity.apply("DOC.10.2", "Batch number found", "n-a");
|
entity.apply("DOC.10.2", "Batch number found", "n-a");
|
||||||
});
|
});
|
||||||
end
|
end
|
||||||
@ -611,12 +630,22 @@ rule "DOC.12.1: Guideline Deviation in text"
|
|||||||
rule "DOC.13.0: Clinical Signs"
|
rule "DOC.13.0: Clinical Signs"
|
||||||
when
|
when
|
||||||
FileAttribute(label == "OECD Number", value == "425")
|
FileAttribute(label == "OECD Number", value == "425")
|
||||||
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
|
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE") && !getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS"))
|
||||||
then
|
then
|
||||||
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
|
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
|
||||||
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
|
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
|
||||||
end
|
end
|
||||||
|
|
||||||
|
/*
|
||||||
|
rule "DOC.13.0: Clinical Signs"
|
||||||
|
when
|
||||||
|
FileAttribute(label == "OECD Number", value == "425")
|
||||||
|
$headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE"))
|
||||||
|
then
|
||||||
|
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY)
|
||||||
|
.forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a"));
|
||||||
|
end
|
||||||
|
*/
|
||||||
|
|
||||||
rule "DOC.14.0: Dosages"
|
rule "DOC.14.0: Dosages"
|
||||||
when
|
when
|
||||||
@ -879,24 +908,33 @@ rule "DOC.24.0: Study Design"
|
|||||||
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
||||||
when
|
when
|
||||||
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
|
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
|
||||||
$section: Section(
|
$parentHeadline: Headline(
|
||||||
(getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion"))
|
containsAnyStringIgnoreCase("Results", "Conclusion"),
|
||||||
&& !getHeadline().containsString("POSITIVE CONTROL")
|
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
|
||||||
&& !getHeadline().containsString("Positive Control")
|
$sectionIdentifier: getSectionIdentifier()
|
||||||
&& !getHeadline().containsString("Evaluation")
|
|
||||||
&& !getHeadline().containsString("Micronucleus")
|
|
||||||
&& !getHeadline().containsString("TABLE")
|
|
||||||
&& !getHeadline().containsString("DISCUSSION")
|
|
||||||
&& !getHeadline().containsString("CONCLUSIONS")
|
|
||||||
&& !getHeadline().containsString("Interpretation")
|
|
||||||
&& !getHeadline().containsString("Viability")
|
|
||||||
)
|
)
|
||||||
|
not Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
|
||||||
then
|
then
|
||||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY)
|
entityCreationService.bySemanticNodeParagraphsOnly($parentHeadline.getParent(), "results_and_conclusion", EntityType.ENTITY)
|
||||||
.forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a"));
|
.forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a"));
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
rule "DOC.25.1: Results and Conclusion (406, 428, 438, 439, 474 & 487)"
|
||||||
|
when
|
||||||
|
FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487"))
|
||||||
|
Headline(
|
||||||
|
containsAnyStringIgnoreCase("Results", "Conclusion"),
|
||||||
|
!containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"),
|
||||||
|
$sectionIdentifier: getSectionIdentifier()
|
||||||
|
)
|
||||||
|
$headline: Headline(getSectionIdentifier().isChildOf($sectionIdentifier))
|
||||||
|
then
|
||||||
|
entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "results_and_conclusion", EntityType.ENTITY)
|
||||||
|
.forEach(entity -> entity.apply("DOC.25.1", "Results and Conclusion found", "n-a"));
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
// TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first.
|
// TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first.
|
||||||
rule "DOC.26.0: Detailing (404 & 405)"
|
rule "DOC.26.0: Detailing (404 & 405)"
|
||||||
when
|
when
|
||||||
@ -933,7 +971,7 @@ rule "DOC.32.0: Preliminary Test Results (429)"
|
|||||||
FileAttribute(label == "OECD Number", value == "429")
|
FileAttribute(label == "OECD Number", value == "429")
|
||||||
$section: Section(
|
$section: Section(
|
||||||
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|
((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations"))
|
||||||
|| getHeadline().containsString("Pre-Experiment"))
|
|| anyHeadlineContainsString("Pre-Experiment"))
|
||||||
)
|
)
|
||||||
then
|
then
|
||||||
entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY)
|
entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user