RED-6619 - add tests for table-extraction

This commit is contained in:
Thomas Beyer 2023-04-27 14:04:22 +02:00
parent 2646407805
commit 6f783a9f00
17 changed files with 7818 additions and 12 deletions

View File

@ -92,7 +92,7 @@ public class PdfSegmentationServiceTest {
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
@ComponentScan(excludeFilters={@ComponentScan.Filter(type= FilterType.ASSIGNABLE_TYPE, value=StorageAutoConfiguration.class)})
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
public static class TestConfiguration {
@Bean
@ -136,7 +136,7 @@ public class PdfSegmentationServiceTest {
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
@ -151,14 +151,14 @@ public class PdfSegmentationServiceTest {
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
}
@ -170,14 +170,14 @@ public class PdfSegmentationServiceTest {
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
}
@ -189,21 +189,294 @@ public class PdfSegmentationServiceTest {
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
}
/**
 * Parses the single-page fixture for document 56, page 170, and expects four tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDoc56Page170() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 4);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 1, 1, 0, 0);
    validateTable(document, 1, 3, 2, 0, 2);
    validateTable(document, 2, 9, 20, 0, 180);
    validateTable(document, 3, 11, 31, 0, 263);
}
/**
 * Parses the single-page fixture {@code VV-931175_Page1.pdf} and expects exactly one
 * table with 15 columns, 9 rows and 74 empty cells.
 */
@Test
public void testVV931175Page1() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 1);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 15, 9, 0, 74);
}
/**
 * Parses the single-page fixture for document 27, page 6, and expects four tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDoc27Page6() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 4);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 3, 2, 0, 0);
    validateTable(document, 1, 4, 2, 0, 2);
    validateTable(document, 2, 3, 3, 0, 0);
    validateTable(document, 3, 4, 3, 0, 3);
}
/**
 * Parses the single-page fixture for A20622A Part B9, page 185, and expects two tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDocA20622APartB9Page185() throws IOException {

    prepareStorage();

    // NOTE(review): this fixture is spelled "izRMS" while the Part B7 fixture used elsewhere in
    // this class is spelled "izZRMS" - confirm both names match the files on the classpath.
    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 2);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 5, 5, 0, 23);
    validateTable(document, 1, 11, 9, 0, 36);
}
/**
 * Parses the single-page fixture for A20622A Part B7, page 123, and expects six tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDocA20622APartB7Page123() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 6);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 3, 1, 0, 1);
    validateTable(document, 1, 3, 1, 0, 1);
    validateTable(document, 2, 3, 5, 0, 5);
    validateTable(document, 3, 3, 5, 0, 5);
    validateTable(document, 4, 3, 4, 0, 4);
    validateTable(document, 5, 3, 1, 0, 1);
}
/**
 * Parses the single-page fixture for document 77 and expects three tables with the
 * column, row and empty-cell counts listed below.
 */
@Test
public void testDoc77Page111() throws IOException {

    prepareStorage();

    // NOTE(review): the method name says Page111 but the fixture file ends in "_Page11.pdf" -
    // confirm which of the two is intended.
    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 3);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 7, 9, 0, 0);
    validateTable(document, 1, 2, 1, 0, 0);
    validateTable(document, 2, 2, 10, 0, 0);
}
/**
 * Parses the single-page fixture for document 95, page 532, and expects exactly one
 * table with 9 columns, 9 rows and no empty cells.
 */
@Test
public void testDoc95Page532() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 1);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 9, 9, 0, 0);
}
/**
 * Parses the single-page fixture for document 52, page 175, and expects exactly one
 * table with 10 columns, 5 rows and 11 empty cells in total.
 */
@Test
public void testDoc52Page175() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 1);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 10, 5, 6, 5);
}
/**
 * Parses the single-page fixture for document 52, page 174, and expects exactly one
 * table with 9 columns, 6 rows and 7 empty cells.
 */
@Test
public void testDoc52Page174() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 1);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 9, 6, 7, 0);
}
/**
 * Parses the single-page fixture for document 19, page 35, and expects exactly one
 * table with 10 columns, 6 rows and 1 empty cell.
 */
@Test
public void testDoc19Page35() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 1);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 10, 6, 0, 1);
}
/**
 * Parses the single-page fixture for document 19, page 161, and expects two tables
 * with the column and row counts listed below and no empty cells.
 */
@Test
public void testDoc19Page161() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 2);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 2, 2, 0, 0);
    validateTable(document, 1, 1, 1, 0, 0);
}
/**
 * Parses the single-page fixture for document 47, page 30, and expects two tables,
 * each with 7 columns, 8 rows and 1 empty cell.
 */
@Test
public void testDoc47Page30() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource(
            "files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 2);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 7, 8, 1, 0);
    validateTable(document, 1, 7, 8, 1, 0);
}
/**
 * Parses the single-page fixture for document 49, page 61, and expects two tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDoc49Page61() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource(
            "files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 2);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 4, 17, 0, 0);
    validateTable(document, 1, 8, 12, 0, 12);
}
/**
 * Parses the single-page fixture for document 81, page 54, and expects two tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDoc81Page54() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 2);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 5, 14, 4, 0);
    validateTable(document, 1, 7, 12, 0, 0);
}
/**
 * Parses the single-page Pydiflumetofen fixture, page 134, and expects two tables
 * with the column, row and empty-cell counts listed below.
 */
@Test
public void testDoc88Page134() throws IOException {

    prepareStorage();

    // NOTE(review): the method name says Doc88 but the fixture file is prefixed "85" -
    // confirm which of the two is intended.
    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 2);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 5, 17, 3, 0);
    validateTable(document, 1, 5, 16, 2, 0);
}
/**
 * Parses the single-page Thiabendazole fixture, page 18, and expects four tables
 * with the column and row counts listed below and no empty cells.
 */
@Test
public void testDocThiabendazolePage18() throws IOException {

    prepareStorage();

    ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");

    Document document;
    // Close the fixture stream as soon as parsing is done instead of leaving it open.
    try (var pdfStream = pdfFileResource.getInputStream()) {
        document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfStream, null);
    }

    validateTableSize(document, 4);

    // Arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect.
    validateTable(document, 0, 4, 4, 0, 0);
    validateTable(document, 1, 1, 1, 0, 0);
    validateTable(document, 2, 2, 3, 0, 0);
    validateTable(document, 3, 1, 1, 0, 0);
}
/**
 * Asserts the dimensions and the number of empty cells of one extracted table.
 *
 * <p>The tables of all paragraphs are flattened in stream order and the one at
 * {@code tableIndex} is checked. A cell counts as empty when its {@code toString()}
 * is the empty string. Only the sum of the two empty-cell parameters is asserted.
 *
 * @param document                 the parsed document to inspect
 * @param tableIndex               zero-based index into the flattened table list
 * @param colCount                 expected value of {@code Table#getColCount()}
 * @param rowCount                 expected value of {@code Table#getRowCount()}
 * @param emptyCellsCountCorrect   first summand of the expected empty-cell count;
 *                                 presumably cells that are legitimately empty - TODO confirm
 * @param emptyCellsCountIncorrect second summand of the expected empty-cell count;
 *                                 presumably cells left empty by an extraction flaw - TODO confirm
 */
private void validateTable(Document document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {

    Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);

    // Check the dimensions first: if the table shape is wrong, the empty-cell count is
    // meaningless and would otherwise hide the real cause of the failure.
    assertThat(table.getColCount()).isEqualTo(colCount);
    assertThat(table.getRowCount()).isEqualTo(rowCount);

    long emptyCellsCountFound = table.getRows().stream()
            .flatMap(List::stream)
            .filter(cell -> cell.toString().isEmpty())
            .count();
    assertThat(emptyCellsCountFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
}
/**
 * Asserts how many tables were extracted from the document in total.
 *
 * @param document  the parsed document to inspect
 * @param tableSize expected number of tables across all paragraphs
 */
private void validateTableSize(Document document, int tableSize) {

    List<Table> tables = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();

    // hasSize reports the actual list on failure, which a bare int comparison cannot.
    assertThat(tables).hasSize(tableSize);
}
@SneakyThrows
private void prepareStorage() {
storageService.storeObject(TenantContext.getTenantId(), RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
storageService.storeObject(TenantContext.getTenantId(),
RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
new ClassPathResource("files/cv_service_empty_response.json").getInputStream());
}