DM-307: Improved table merging #52
@ -16,6 +16,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -35,6 +36,15 @@ public class RectangleTransformations {
|
||||
return annotationPosition;
|
||||
}
|
||||
|
||||
public static Rectangle2D abstractPageBlockBBox(List<AbstractPageBlock> abstractPageBlocks) {
|
||||
|
||||
return abstractPageBlocks.stream()
|
||||
.map(abstractPageBlock -> new Rectangle2D.Double(abstractPageBlock.getMinX(),
|
||||
abstractPageBlock.getMinY(),
|
||||
abstractPageBlock.getWidth(),
|
||||
abstractPageBlock.getHeight())).collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
|
||||
@ -14,6 +14,9 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class TableMergingUtility {
|
||||
|
||||
private static final double TABLE_ALIGNMENT_THRESHOLD = 2d;
|
||||
|
||||
|
||||
public List<TablePageBlock> findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
|
||||
@ -24,7 +27,8 @@ public class TableMergingUtility {
|
||||
|
||||
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
|
||||
for (TablePageBlock consecutiveTable : consecutiveTables) {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
||||
consecutiveTable)) {
|
||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||
} else {
|
||||
break;
|
||||
@ -34,6 +38,12 @@ public class TableMergingUtility {
|
||||
}
|
||||
|
||||
|
||||
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
|
||||
|
||||
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasTableHeader(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
|
||||
|
||||
@ -45,7 +45,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
|
||||
@Disabled
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/402-16_Fantom_ToxicidadeCutaneaAguda.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf");
|
||||
|
||||
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
|
||||
// "files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");
|
||||
|
||||
@ -320,7 +320,7 @@ rule "DOC.7.0: study title by document structure"
|
||||
$table: Table(isOnPage(1),
|
||||
(containsString("Final Report") || containsString("SPL")),
|
||||
numberOfRows == 1,
|
||||
numberOfCols == 1)
|
||||
numberOfCols == 1, getCell(0,0).streamChildren().count() == 3)
|
||||
then
|
||||
|
||||
entityCreationService.bySemanticNode($table.getCell(0, 0).streamChildren().toList().get(1), "title", EntityType.ENTITY).ifPresent(entity -> {
|
||||
@ -331,12 +331,9 @@ rule "DOC.7.0: study title by document structure"
|
||||
|
||||
rule "DOC.7.1: study title"
|
||||
when
|
||||
$table: Table(isOnPage(1),
|
||||
(containsString("Final Report") || containsString("SPL")),
|
||||
numberOfRows == 1,
|
||||
numberOfCols == 1)
|
||||
$table: Table(isOnPage(1), (containsString("Final Report") || containsString("SPL")))
|
||||
then
|
||||
entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $table.getCell(0, 0)).findFirst().ifPresent(entity -> {
|
||||
entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $table).findFirst().ifPresent(entity -> {
|
||||
entity.apply("DOC.7.1", "Title found", "n-a");
|
||||
});
|
||||
end
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user