Fix multi-page table merge
This commit is contained in:
parent
bb1112d0d7
commit
baa703928f
@ -42,13 +42,17 @@ public class SectionsBuilderService {
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline, previousTable);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
lastHeadline = current.getText();
|
||||
if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) {
|
||||
previousTable = chunkBlock.getTables().get(0);
|
||||
}
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) {
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
} else if (current instanceof Table) {
|
||||
Table table = (Table) current;
|
||||
// Distribute header information for subsequent tables
|
||||
mergeTableMetadata(table, previousTable);
|
||||
previousTable = table;
|
||||
}
|
||||
|
||||
chunkWords.add(current);
|
||||
prev = current;
|
||||
}
|
||||
@ -62,6 +66,34 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(Table currentTable, Table previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows()
|
||||
.get(0)
|
||||
.size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0);
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRows()
|
||||
.size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline, Table previousTable) {
|
||||
|
||||
Paragraph paragraph = new Paragraph();
|
||||
@ -73,6 +105,7 @@ public class SectionsBuilderService {
|
||||
Iterator<AbstractTextContainer> itty = wordBlockList.iterator();
|
||||
boolean alreadyAdded = false;
|
||||
AbstractTextContainer previous = null;
|
||||
Table sectionTable = previousTable;
|
||||
while (itty.hasNext()) {
|
||||
AbstractTextContainer container = itty.next();
|
||||
|
||||
@ -85,27 +118,8 @@ public class SectionsBuilderService {
|
||||
} else {
|
||||
table.setHeadline("Table in: " + lastHeadline);
|
||||
}
|
||||
// Distribute header information for subsequent tables
|
||||
if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(table);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0);
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size()
|
||||
&& row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
mergeTableMetadata(table, sectionTable);
|
||||
sectionTable = table;
|
||||
|
||||
if (textBlock != null && !alreadyAdded) {
|
||||
paragraph.getPageBlocks().add(textBlock);
|
||||
|
||||
@ -105,4 +105,44 @@ public class PdfSegmentationServiceTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMultiPageMetadataPropagation() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells)))
|
||||
.isTrue();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user