diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 16017f95..0e84afa1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -42,13 +42,17 @@ public class SectionsBuilderService { Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline, previousTable); chunkBlock.setHeadline(lastHeadline); lastHeadline = current.getText(); - if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) { - previousTable = chunkBlock.getTables().get(0); - } chunkBlockList.add(chunkBlock); chunkWords = new ArrayList<>(); + if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) { + previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1); + } + } else if (current instanceof Table) { + Table table = (Table) current; + // Distribute header information for subsequent tables + mergeTableMetadata(table, previousTable); + previousTable = table; } - chunkWords.add(current); prev = current; } @@ -62,6 +66,34 @@ public class SectionsBuilderService { } + private void mergeTableMetadata(Table currentTable, Table previousTable) { + + // Distribute header information for subsequent tables + if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) { + List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); + List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); + // Allow merging of tables if header row is separated from first logical non-header row + if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows() + .get(0) + .size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows().get(0); + } + if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { + for (int i = currentTable.getRows() + .size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = currentTable.getRows().get(i); + if (row.size() == tableNonHeaderRow.size() && row.stream() + .allMatch(cell -> cell.getHeaderCells().isEmpty())) { + for (int j = 0; j < row.size(); j++) { + row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); + } + } + } + } + } + } + + private Paragraph buildTextBlock(List wordBlockList, String lastHeadline, Table previousTable) { Paragraph paragraph = new Paragraph(); @@ -73,6 +105,7 @@ public class SectionsBuilderService { Iterator itty = wordBlockList.iterator(); boolean alreadyAdded = false; AbstractTextContainer previous = null; + Table sectionTable = previousTable; while (itty.hasNext()) { AbstractTextContainer container = itty.next(); @@ -85,27 +118,8 @@ public class SectionsBuilderService { } else { table.setHeadline("Table in: " + lastHeadline); } - // Distribute header information for subsequent tables - if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) { - List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); - List tableNonHeaderRow = getRowWithNonHeaderCells(table); - // Allow merging of tables if header row is separated from first logical non-header row - if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 - && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { - previousTableNonHeaderRow = previousTable.getRows().get(0); - } - if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { - for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = table.getRows().get(i); - if (row.size() == tableNonHeaderRow.size() - && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { - for (int j = 0; j < row.size(); j++) { - row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); - } - } - } - } - } + mergeTableMetadata(table, sectionTable); + sectionTable = table; if (textBlock != null && !alreadyAdded) { paragraph.getPageBlocks().add(textBlock); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index f85dd8dd..19498e67 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -105,4 +105,44 @@ public class PdfSegmentationServiceTest { } } + + @Test + public void testMultiPageMetadataPropagation() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); + + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document document = pdfSegmentationService.parseDocument(pdDocument); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(9); + assertThat(firstTable.getRowCount()).isEqualTo(5); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(9); + assertThat(secondTable.getRowCount()).isEqualTo(6); + List> firstTableHeaderCells = firstTable.getRows() + .get(firstTable.getRowCount() - 1) + .stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()); + assertThat(secondTable.getRows().stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))) + .isTrue(); + } + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Merge Multi Page Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Merge Multi Page Table.pdf new file mode 100644 index 00000000..feba9632 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Merge Multi Page Table.pdf differ