From e930a124bac67386f96bae2bbc2c0b42a9bf0063 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Mon, 24 Jul 2023 16:19:22 +0200 Subject: [PATCH] RED-5253: Improved headline detection for DocuMine 2 --- .../DocuMineClassificationService.java | 28 +++--- .../v1/server/DocumineFloraTest.java | 2 +- .../test/resources/drools/documine_flora.drl | 94 +++++++++++++------ 3 files changed, 81 insertions(+), 43 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java index 50f0a1f9..b91d01fa 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java @@ -69,20 +69,7 @@ public class DocuMineClassificationService implements ClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() - .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 - - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() - .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) { - textBlock.setClassification(PageBlockType.getHeadlineType(1)); - document.setHeadlines(true); - - } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) { - textBlock.setClassification(PageBlockType.getHeadlineType(2)); - document.setHeadlines(true); - } else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); @@ -95,6 +82,19 @@ public class DocuMineClassificationService implements ClassificationService { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } + } else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() + .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 + + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() + .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() + .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() + .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) { + textBlock.setClassification(PageBlockType.getHeadlineType(1)); + document.setHeadlines(true); + + } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) { + textBlock.setClassification(PageBlockType.getHeadlineType(2)); + document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java index 79e6871b..0e5b4516 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java @@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest { @Disabled public void titleExtraction() throws IOException { - AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/F.2. A16003E - Acute Inhalation Study.pdf"); + AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf"); // AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf", // "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl index 43381096..483eefd1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl @@ -55,16 +55,30 @@ query "getFileAttributes" //--------------------------------------------------------------------------- - rule "H.0.0 retract table of contents page" when - $page: Page(getMainBodyTextBlock().getSearchText().contains("........")) + $page: Page(getMainBodyTextBlock().getSearchText().contains("........") || (getMainBodyTextBlock().getSearchText().contains("APPENDICES") && getMainBodyTextBlock().getSearchText().contains("TABLES"))) $node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1)) then retract($node); end +rule "H.0.0: Ignore Table of Contents" + salience 10 + when + $tocHeadline: Headline(containsString("CONTENTS")) + + then + $tocHeadline.getParent().getPages() + .forEach(page -> page.getMainBody().stream() + .filter(node -> node.getPages().stream().noneMatch(nodePage -> nodePage.getNumber() < page.getNumber())) + .forEach(node -> retract(node)) + ); + end + + + // Rule unit: MAN.0 rule "H.0.0: Show headlines" when @@ -262,6 +276,7 @@ rule "DOC.3.1: Experimental Completion Date" // ignore species and strain in irrelevant study types rule "DOC.4.1: Species" + salience 1 when FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","471","474","487")) $section: Section(hasEntitiesOfType("species") || hasEntitiesOfType("strain")) @@ -275,6 +290,7 @@ rule "DOC.3.1: Experimental Completion Date" // hide all skipped species and strains except in the relevant sections rule "DOC.4.2: Species" + salience 1 when FileAttribute(label == "OECD Number", valueEqualsAnyOf("402","403","404","405","425","429","436")) $section: Section( @@ -329,7 +345,7 @@ rule "DOC.7.0: study title by document structure" $table: Table(isOnPage(1), (containsString("Final Report") || containsString("SPL")), numberOfRows == 1, - numberOfCols == 1, getCell(0,0).streamChildren().count() == 3) + numberOfCols == 1) then entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> { @@ -359,6 +375,7 @@ rule "DOC.7.2: study title" end + rule "DOC.8.1: Performing Laboratory (Name)" when $section: Section(containsString("PERFORMING LABORATORY:")) @@ -474,10 +491,8 @@ rule "DOC.10.1: Batch number" when $section: Section( ( - anyHeadlineContainsStringIgnoreCase("Test Substance") + anyHeadlineContainsStringIgnoreCase("Test Substance") || anyHeadlineContainsStringIgnoreCase("Test and Control Substances") - || anyHeadlineContainsStringIgnoreCase("Test Substances") - || anyHeadlineContainsStringIgnoreCase("Test Substance") || anyHeadlineContainsStringIgnoreCase("Test Item") ) && !( @@ -511,8 +526,6 @@ rule "DOC.10.2: Batch number" ( anyHeadlineContainsStringIgnoreCase("Test Substance") || anyHeadlineContainsStringIgnoreCase("Test and Control Substances") - || anyHeadlineContainsStringIgnoreCase("Test Substances") - || anyHeadlineContainsStringIgnoreCase("Test Substance") || anyHeadlineContainsStringIgnoreCase("Test Item") ) && !( @@ -522,12 +535,18 @@ rule "DOC.10.2: Batch number" ) && containsStringIgnoreCase("batch") ) - $table: Table() from $section.streamAllSubNodesOfType(NodeType.TABLE).toList() + $batchNumber: String() from List.of("Batch Identification", + "Batch number:", + "Batch reference number:", + "Batch:", + "Batch/Lot number:", + "Batch (Lot) Number:", + "Batch Number:", + "Batch NÂș:", + "Batch no:") + $table: Table(containsStringIgnoreCase($batchNumber)) from $section.streamAllSubNodesOfType(NodeType.TABLE).toList() then - entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> { - entity.apply("DOC.10.2", "Batch number found", "n-a"); - }); - entityCreationService.lineAfterStringAcrossColumnsIgnoreCase("Batch (Lot) Number:", "batch_number", EntityType.ENTITY, $table).forEach(entity -> { + entityCreationService.lineAfterStringAcrossColumnsIgnoreCase($batchNumber, "batch_number", EntityType.ENTITY, $table).forEach(entity -> { entity.apply("DOC.10.2", "Batch number found", "n-a"); }); end @@ -611,12 +630,22 @@ rule "DOC.12.1: Guideline Deviation in text" rule "DOC.13.0: Clinical Signs" when FileAttribute(label == "OECD Number", value == "425") - $headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE")) + $headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE") && !getHeadline().containsStringIgnoreCase("3 - MACROSCOPIC FINDINGS")) then entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY) .forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a")); end +/* +rule "DOC.13.0: Clinical Signs" + when + FileAttribute(label == "OECD Number", value == "425") + $headline: Headline(containsAnyStringIgnoreCase("Clinical Signs", "Macroscopic Findings") && !containsString("TABLE")) + then + entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "clinical_signs", EntityType.ENTITY) + .forEach(entity -> entity.apply("DOC.13.0", "Clinical Signs found", "n-a")); + end +*/ rule "DOC.14.0: Dosages" when @@ -879,24 +908,33 @@ rule "DOC.24.0: Study Design" rule "DOC.25.0: Results and Conclusion (406, 428, 438, 439, 474 & 487)" when FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487")) - $section: Section( - (getHeadline().containsStringIgnoreCase("Results") || getHeadline().containsStringIgnoreCase("Conclusion")) - && !getHeadline().containsString("POSITIVE CONTROL") - && !getHeadline().containsString("Positive Control") - && !getHeadline().containsString("Evaluation") - && !getHeadline().containsString("Micronucleus") - && !getHeadline().containsString("TABLE") - && !getHeadline().containsString("DISCUSSION") - && !getHeadline().containsString("CONCLUSIONS") - && !getHeadline().containsString("Interpretation") - && !getHeadline().containsString("Viability") - ) + $parentHeadline: Headline( + containsAnyStringIgnoreCase("Results", "Conclusion"), + !containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"), + $sectionIdentifier: getSectionIdentifier() + ) + not Headline(getSectionIdentifier().isChildOf($sectionIdentifier)) then - entityCreationService.bySemanticNodeParagraphsOnly($section, "results_and_conclusion", EntityType.ENTITY) + entityCreationService.bySemanticNodeParagraphsOnly($parentHeadline.getParent(), "results_and_conclusion", EntityType.ENTITY) .forEach(entity -> entity.apply("DOC.25.0", "Results and Conclusion found", "n-a")); end +rule "DOC.25.1: Results and Conclusion (406, 428, 438, 439, 474 & 487)" + when + FileAttribute(label == "OECD Number", valueEqualsAnyOf("406","428","438","439","474","487")) + Headline( + containsAnyStringIgnoreCase("Results", "Conclusion"), + !containsAnyString("POSITIVE CONTROL", "Positive Control", "Evaluation", "Micronucleus", "TABLE", "DISCUSSION", "CONCLUSIONS", "Interpretation","Viability"), + $sectionIdentifier: getSectionIdentifier() + ) + $headline: Headline(getSectionIdentifier().isChildOf($sectionIdentifier)) + then + entityCreationService.bySemanticNodeParagraphsOnly($headline.getParent(), "results_and_conclusion", EntityType.ENTITY) + .forEach(entity -> entity.apply("DOC.25.1", "Results and Conclusion found", "n-a")); + end + + // TBD: This rule now finds both Results and RESULTS AND DISCUSSION. This ensures that we do not have empty Components in some of the files. In RESULTS AND DISCUSSION we should find every Subsection, not just the first. rule "DOC.26.0: Detailing (404 & 405)" when @@ -933,7 +971,7 @@ rule "DOC.32.0: Preliminary Test Results (429)" FileAttribute(label == "OECD Number", value == "429") $section: Section( ((anyHeadlineContainsString("Preliminary Screening Test") && containsString("Clinical observations")) - || getHeadline().containsString("Pre-Experiment")) + || anyHeadlineContainsString("Pre-Experiment")) ) then entityCreationService.bySemanticNodeParagraphsOnly($section, "preliminary_test_results", EntityType.ENTITY) -- 2.47.2