From 09c18c110ae3c76f07dc4453ace9cb430c9ea0de Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 5 Sep 2024 14:26:45 +0200 Subject: [PATCH 1/2] hotfix: unmerge super large tables --- .../model/outline/TOCEnrichmentService.java | 21 +++++++------------ .../services/factory/TableNodeFactory.java | 5 +---- .../processor/utils/TableMergingUtility.java | 8 +------ 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java index 71da252..f64fbef 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -185,12 +186,8 @@ public class TOCEnrichmentService { List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); // Allow merging of tables if header row is separated from first logical non-header row - if (previousTableNonHeaderRow.isEmpty() - && previousTable.getRowCount() == 1 - && previousTable.getRows() - .get(0).size() == tableNonHeaderRow.size()) { - previousTableNonHeaderRow = previousTable.getRows() - .get(0) + if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows().get(0) .stream() .map(cell -> { Cell fakeCell = Cell.copy(cell); @@ -201,8 +198,7 @@ public class TOCEnrichmentService { } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = currentTable.getRows() - .get(i); + List row = currentTable.getRows().get(i); if (row.size() == tableNonHeaderRow.size() && row.stream() .allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { @@ -225,18 +221,15 @@ public class TOCEnrichmentService { return table.getRows() .stream() - .flatMap(row -> row.stream() - .filter(cell -> !cell.getHeaderCells().isEmpty())) - .findAny().isEmpty(); - + .flatMap(Collection::stream) + .anyMatch(cell -> !cell.getHeaderCells().isEmpty()); } private List getRowWithNonHeaderCells(TablePageBlock table) { for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = table.getRows() - .get(i); + List row = table.getRows().get(i); if (row.size() == 1) { continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index e14075e..8d06a29 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -36,10 +36,7 @@ public class TableNodeFactory { Document document) { setPageNumberInCells(tablesToMerge); - Set pages = tablesToMerge.stream() - .map(AbstractPageBlock::getPage) - .map(context::getPage) - .collect(Collectors.toSet()); + List> mergedRows = tablesToMerge.stream() .map(TablePageBlock::getRows) .flatMap(Collection::stream) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java index 3f9a92b..d33b63c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java @@ -36,7 +36,7 @@ public class TableMergingUtility { TablePageBlock consecutiveTable = consecutiveTables.get(i); if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() // - && headersMatch(originalTablePageBlock, consecutiveTable) // + && getHeaders(consecutiveTable).isEmpty() // && outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) // && consecutiveOrSamePage(currentTable, consecutiveTable) // && !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) { @@ -80,12 +80,6 @@ public class TableMergingUtility { } - private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) { - - return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable)); - } - - private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) { return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD From 90a1187921a8bac6d134bd971e22677acbd09bad Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 5 Sep 2024 14:50:19 +0200 Subject: [PATCH 2/2] hotfix: unmerge super large tables --- .../processor/model/outline/TOCEnrichmentService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java index f64fbef..72bad8b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -222,7 +222,7 @@ public class TOCEnrichmentService { return table.getRows() .stream() .flatMap(Collection::stream) - .anyMatch(cell -> !cell.getHeaderCells().isEmpty()); + .allMatch(cell -> cell.getHeaderCells().isEmpty()); }