diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index a5893161..16017f95 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -89,10 +89,15 @@ public class SectionsBuilderService { if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable)) { List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List tableNonHeaderRow = getRowWithNonHeaderCells(table); + // Allow merging of tables if header row is separated from first logical non-header row + if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 + && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows().get(0); + } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = table.getRows().size() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table List row = table.getRows().get(i); - if (row.size() == previousTableNonHeaderRow.size() + if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index 537fa91b..f85dd8dd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -18,6 +18,7 @@ import org.springframework.test.context.junit4.SpringRunner; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; @@ -64,4 +65,44 @@ public class PdfSegmentationServiceTest { } } + + @Test + public void testTableExtraction() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); + + try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { + Document document = pdfSegmentationService.parseDocument(pdDocument); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(8); + assertThat(firstTable.getRowCount()).isEqualTo(1); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(8); + assertThat(secondTable.getRowCount()).isEqualTo(2); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()); + assertThat(secondTable.getRows().stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))) + .isTrue(); + } + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Merge Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Merge Table.pdf new file mode 100644 index 00000000..64e29896 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Merge Table.pdf differ