DM-307: Improved table merging #52

Merged
dominique.eiflaender1 merged 1 commit from DM307 into master 2023-07-14 15:57:31 +02:00
7 changed files with 25 additions and 8 deletions

View File

@ -16,6 +16,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
import lombok.AllArgsConstructor;
@ -35,6 +36,15 @@ public class RectangleTransformations {
return annotationPosition;
}
/**
 * Reduces a list of page blocks to a single {@link Rectangle2D}.
 *
 * <p>Each block is converted into a rectangle built from its min-x, min-y, width and height,
 * and the resulting rectangles are handed to a {@code Rectangle2DBBoxCollector}.
 *
 * @param abstractPageBlocks the page blocks whose rectangles are collected
 * @return whatever the collector produces for those rectangles (presumably the enclosing
 *         bounding box; the result for an empty list depends on the collector and should be
 *         confirmed there)
 */
public static Rectangle2D abstractPageBlockBBox(List<AbstractPageBlock> abstractPageBlocks) {
    return abstractPageBlocks.stream()
            .map(block -> {
                // Read the geometry in the same order the rectangle constructor expects it.
                double x = block.getMinX();
                double y = block.getMinY();
                double width = block.getWidth();
                double height = block.getHeight();
                return new Rectangle2D.Double(x, y, width, height);
            })
            .collect(new Rectangle2DBBoxCollector());
}
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {

View File

@ -14,6 +14,9 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class TableMergingUtility {
private static final double TABLE_ALIGNMENT_THRESHOLD = 2d;
public List<TablePageBlock> findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List<AbstractPageBlock> pageBlocks) {
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
@ -24,7 +27,8 @@ public class TableMergingUtility {
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
for (TablePageBlock consecutiveTable : consecutiveTables) {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
consecutiveTable)) {
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
} else {
break;
@ -34,6 +38,12 @@ public class TableMergingUtility {
}
/**
 * Tells whether two tables line up horizontally at both outer edges.
 *
 * <p>The tables are considered aligned when the absolute difference of their min-x values and
 * the absolute difference of their max-x values are each strictly smaller than
 * {@code TABLE_ALIGNMENT_THRESHOLD}.
 *
 * @param originalTablePageBlock the table used as the reference
 * @param consecutiveTable the table compared against the reference
 * @return {@code true} if both the min-x and the max-x differences are below the threshold
 */
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
    double minXOffset = Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX());
    boolean minXAligned = minXOffset < TABLE_ALIGNMENT_THRESHOLD;
    if (!minXAligned) {
        // The max-x side is only inspected once the min-x side is within tolerance.
        return false;
    }

    double maxXOffset = Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX());
    return maxXOffset < TABLE_ALIGNMENT_THRESHOLD;
}
private boolean hasTableHeader(TablePageBlock table) {
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);

View File

@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Disabled
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/402-16_Fantom_ToxicidadeCutaneaAguda.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf");
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
// "files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");

View File

@ -320,7 +320,7 @@ rule "DOC.7.0: study title by document structure"
$table: Table(isOnPage(1),
(containsString("Final Report") || containsString("SPL")),
numberOfRows == 1,
numberOfCols == 1)
numberOfCols == 1, getCell(0,0).streamChildren().count() == 3)
then
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
@ -331,12 +331,9 @@ rule "DOC.7.0: study title by document structure"
rule "DOC.7.1: study title"
when
$table: Table(isOnPage(1),
(containsString("Final Report") || containsString("SPL")),
numberOfRows == 1,
numberOfCols == 1)
$table: Table(isOnPage(1), (containsString("Final Report") || containsString("SPL")))
then
entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $table.getCell(0, 0)).findFirst().ifPresent(entity -> {
entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $table).findFirst().ifPresent(entity -> {
entity.apply("DOC.7.1", "Title found", "n-a");
});
end