diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java index 122bfaab..8211323d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java @@ -16,6 +16,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock; import lombok.AllArgsConstructor; @@ -35,6 +36,15 @@ public class RectangleTransformations { return annotationPosition; } + public static Rectangle2D abstractPageBlockBBox(List<AbstractPageBlock> abstractPageBlocks) { + + return abstractPageBlocks.stream() + .map(abstractPageBlock -> new Rectangle2D.Double(abstractPageBlock.getMinX(), + abstractPageBlock.getMinY(), + abstractPageBlock.getWidth(), + abstractPageBlock.getHeight())).collect(new Rectangle2DBBoxCollector()); + } + public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TableMergingUtility.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TableMergingUtility.java index bd525dbb..51995d0d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TableMergingUtility.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TableMergingUtility.java @@ -14,6 +14,9 @@ import lombok.experimental.UtilityClass; @UtilityClass public class TableMergingUtility { + private static final double TABLE_ALIGNMENT_THRESHOLD = 2d; + + public List findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List pageBlocks) { List consecutiveTables = pageBlocks.stream() @@ -24,7 +27,8 @@ public class TableMergingUtility { List consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>(); for (TablePageBlock consecutiveTable : consecutiveTables) { - if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) { + if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock, + consecutiveTable)) { consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable); } else { break; @@ -34,6 +38,12 @@ public class TableMergingUtility { } + private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) { + + return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD; + } + + private boolean hasTableHeader(TablePageBlock table) { return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell); diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java index 73e643f7..9de5c4eb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java @@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest { @Disabled public void titleExtraction() throws IOException { - AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/402-16_Fantom_ToxicidadeCutaneaAguda.pdf"); + AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf"); // AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf", // "files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl index a84efef4..3e7c8725 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl @@ -320,7 +320,7 @@ rule "DOC.7.0: study title by document structure" $table: Table(isOnPage(1), (containsString("Final Report") || containsString("SPL")), numberOfRows == 1, - numberOfCols == 1) + numberOfCols == 1, getCell(0,0).streamChildren().count() == 3) then entityCreationService.bySemanticNode($table.getCell(0, 
0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> { @@ -331,12 +331,9 @@ rule "DOC.7.0: study title by document structure" rule "DOC.7.1: study title" when - $table: Table(isOnPage(1), - (containsString("Final Report") || containsString("SPL")), - numberOfRows == 1, - numberOfCols == 1) + $table: Table(isOnPage(1), (containsString("Final Report") || containsString("SPL"))) then - entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $table.getCell(0, 0)).findFirst().ifPresent(entity -> { + entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $table).findFirst().ifPresent(entity -> { entity.apply("DOC.7.1", "Title found", "n-a"); }); end diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/A21550L - Acute Inhalation - Rats.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/A21550L - Acute Inhalation - Rats.pdf new file mode 100644 index 00000000..bd683160 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/A21550L - Acute Inhalation - Rats.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf new file mode 100644 index 00000000..8d3864bb Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf differ diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf new file mode 100644 index 00000000..34e1f804 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf differ