Merge branch 'hotfix-bp' into 'release/0.159.x'

hotfix: unmerge super large tables

See merge request fforesight/layout-parser!219
This commit is contained in:
Kilian Schüttler 2024-09-05 15:05:11 +02:00
commit f6c60aa5eb
3 changed files with 9 additions and 25 deletions

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
@ -185,12 +186,8 @@ public class TOCEnrichmentService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
@ -201,8 +198,7 @@ public class TOCEnrichmentService {
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows()
.get(i);
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
@ -225,18 +221,15 @@ public class TOCEnrichmentService {
return table.getRows()
.stream()
.flatMap(row -> row.stream()
.filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
.flatMap(Collection::stream)
.allMatch(cell -> cell.getHeaderCells().isEmpty());
}
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows()
.get(i);
List<Cell> row = table.getRows().get(i);
if (row.size() == 1) {
continue;
}

View File

@ -36,10 +36,7 @@ public class TableNodeFactory {
Document document) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream()
.map(AbstractPageBlock::getPage)
.map(context::getPage)
.collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream()
.map(TablePageBlock::getRows)
.flatMap(Collection::stream)

View File

@ -36,7 +36,7 @@ public class TableMergingUtility {
TablePageBlock consecutiveTable = consecutiveTables.get(i);
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() //
&& headersMatch(originalTablePageBlock, consecutiveTable) //
&& getHeaders(consecutiveTable).isEmpty() //
&& outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) //
&& consecutiveOrSamePage(currentTable, consecutiveTable) //
&& !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) {
@ -80,12 +80,6 @@ public class TableMergingUtility {
}
/**
 * Checks whether the headers of a consecutive table are compatible with those of the original table.
 * Used as one of the conditions when deciding whether consecutive tables are merge candidates.
 *
 * @param originalTable    table whose headers serve as the reference
 * @param consecutiveTable table whose headers are compared against the reference
 * @return {@code true} if {@code getHeaders(consecutiveTable)} is empty, or if it is equal (via {@code equals})
 *         to {@code getHeaders(originalTable)}; {@code false} otherwise
 */
private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) {
// Short-circuit: a consecutive table without headers always matches; otherwise both header collections must be equal.
return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable));
}
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD