Fix multi-page table merge

This commit is contained in:
Thierry Göckel 2020-08-24 13:03:03 +02:00
parent bb1112d0d7
commit baa703928f
3 changed files with 79 additions and 25 deletions

View File

@ -42,13 +42,17 @@ public class SectionsBuilderService {
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline, previousTable);
chunkBlock.setHeadline(lastHeadline);
lastHeadline = current.getText();
if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) {
previousTable = chunkBlock.getTables().get(0);
}
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) {
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
}
} else if (current instanceof Table) {
Table table = (Table) current;
// Distribute header information for subsequent tables
mergeTableMetadata(table, previousTable);
previousTable = table;
}
chunkWords.add(current);
prev = current;
}
@ -62,6 +66,34 @@ public class SectionsBuilderService {
}
/**
 * Copies header-cell assignments from {@code previousTable} onto the rows of {@code currentTable}
 * that do not carry any header cells yet.
 *
 * <p>Nothing happens unless {@code previousTable} is non-null, {@code currentTable} is reported by
 * {@code hasInvalidHeaderInformation} and {@code previousTable} is reported by
 * {@code hasValidHeaderInformation}. The row used as template is the one returned by
 * {@code getRowWithNonHeaderCells(previousTable)}; if that row is empty and the previous table
 * consists of exactly one row of matching width, that single row is used instead.
 *
 * @param currentTable  table whose rows may receive header cells; its cells are mutated in place
 * @param previousTable table supplying the header cells, may be {@code null}
 */
private void mergeTableMetadata(Table currentTable, Table previousTable) {
    boolean mergeApplicable = previousTable != null
            && hasInvalidHeaderInformation(currentTable)
            && hasValidHeaderInformation(previousTable);
    if (!mergeApplicable) {
        return;
    }

    List<Cell> templateRow = getRowWithNonHeaderCells(previousTable);
    List<Cell> referenceRow = getRowWithNonHeaderCells(currentTable);

    // Allow merging of tables if header row is separated from first logical non-header row
    if (templateRow.isEmpty()
            && previousTable.getRowCount() == 1
            && previousTable.getRows().get(0).size() == referenceRow.size()) {
        templateRow = previousTable.getRows().get(0);
    }

    // Column counts must line up, otherwise cells cannot be paired by index.
    if (templateRow.size() != referenceRow.size()) {
        return;
    }

    // Walk bottom-up: non header rows are most likely at bottom of table
    for (int rowIndex = currentTable.getRows().size() - 1; rowIndex >= 0; rowIndex--) {
        List<Cell> candidateRow = currentTable.getRows().get(rowIndex);
        if (candidateRow.size() != referenceRow.size()) {
            continue;
        }
        boolean alreadyHasHeaders = candidateRow.stream()
                .anyMatch(cell -> !cell.getHeaderCells().isEmpty());
        if (alreadyHasHeaders) {
            continue;
        }
        for (int column = 0; column < candidateRow.size(); column++) {
            candidateRow.get(column).setHeaderCells(templateRow.get(column).getHeaderCells());
        }
    }
}
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline, Table previousTable) {
Paragraph paragraph = new Paragraph();
@ -73,6 +105,7 @@ public class SectionsBuilderService {
Iterator<AbstractTextContainer> itty = wordBlockList.iterator();
boolean alreadyAdded = false;
AbstractTextContainer previous = null;
Table sectionTable = previousTable;
while (itty.hasNext()) {
AbstractTextContainer container = itty.next();
@ -85,27 +118,8 @@ public class SectionsBuilderService {
} else {
table.setHeadline("Table in: " + lastHeadline);
}
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(table);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1
&& previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0);
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
if (row.size() == tableNonHeaderRow.size()
&& row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
mergeTableMetadata(table, sectionTable);
sectionTable = table;
if (textBlock != null && !alreadyAdded) {
paragraph.getPageBlocks().add(textBlock);

View File

@ -105,4 +105,44 @@ public class PdfSegmentationServiceTest {
}
}
/**
 * Parses the "Merge Multi Page Table" sample PDF and checks the first two tables found across
 * all paragraphs: their expected dimensions, and that every row of the second table carries the
 * same header cells as the last row of the first table.
 */
@Test
public void testMultiPageMetadataPropagation() throws IOException {

    ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");

    try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
        Document document = pdfSegmentationService.parseDocument(pdDocument);

        // Flatten the tables of all paragraphs once, in document order.
        List<Table> allTables = document.getParagraphs()
                .stream()
                .flatMap(paragraph -> paragraph.getTables().stream())
                .collect(Collectors.toList());
        assertThat(allTables).isNotEmpty();

        Table firstTable = allTables.get(0);
        assertThat(firstTable.getColCount()).isEqualTo(9);
        assertThat(firstTable.getRowCount()).isEqualTo(5);

        Table secondTable = allTables.get(1);
        assertThat(secondTable.getColCount()).isEqualTo(9);
        assertThat(secondTable.getRowCount()).isEqualTo(6);

        // Header cells of the last row of the first table serve as the reference.
        int lastRowIndex = firstTable.getRowCount() - 1;
        List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
                .get(lastRowIndex)
                .stream()
                .map(Cell::getHeaderCells)
                .collect(Collectors.toList());

        boolean everyRowSharesHeaders = secondTable.getRows()
                .stream()
                .allMatch(row -> row.stream()
                        .map(Cell::getHeaderCells)
                        .collect(Collectors.toList())
                        .equals(firstTableHeaderCells));
        assertThat(everyRowSharesHeaders).isTrue();
    }
}
}