RED-6619 - add tests for table-extraction
This commit is contained in:
parent
2646407805
commit
6f783a9f00
@ -92,7 +92,7 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
@ComponentScan(excludeFilters={@ComponentScan.Filter(type= FilterType.ASSIGNABLE_TYPE, value=StorageAutoConfiguration.class)})
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
public static class TestConfiguration {
|
||||
|
||||
@Bean
|
||||
@ -136,7 +136,7 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
@ -151,14 +151,14 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -170,14 +170,14 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -189,21 +189,294 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc56Page170() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 3, 2, 0, 2);
|
||||
validateTable(document, 2, 9, 20, 0, 180);
|
||||
validateTable(document, 3, 11, 31, 0, 263);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testVV931175Page1() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 15, 9, 0, 74);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc27Page6() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 3, 2, 0, 0);
|
||||
validateTable(document, 1, 4, 2, 0, 2);
|
||||
validateTable(document, 2, 3, 3, 0, 0);
|
||||
validateTable(document, 3, 4, 3, 0, 3);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocA20622APartB9Page185() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 5, 0, 23);
|
||||
validateTable(document, 1, 11, 9, 0, 36);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocA20622APartB7Page123() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
validateTable(document, 0, 3, 1, 0, 1);
|
||||
validateTable(document, 1, 3, 1, 0, 1);
|
||||
validateTable(document, 2, 3, 5, 0, 5);
|
||||
validateTable(document, 3, 3, 5, 0, 5);
|
||||
validateTable(document, 4, 3, 4, 0, 4);
|
||||
validateTable(document, 5, 3, 1, 0, 1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc77Page111() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 3);
|
||||
|
||||
validateTable(document, 0, 7, 9, 0, 0);
|
||||
validateTable(document, 1, 2, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 10, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc95Page532() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 9, 9, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc52Page175() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 10, 5, 6, 5);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc52Page174() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 6, 7, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc19Page35() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc19Page161() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 2, 2, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc47Page30() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 7, 8, 1, 0);
|
||||
validateTable(document, 1, 7, 8, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc49Page61() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 4, 17, 0, 0);
|
||||
validateTable(document, 1, 8, 12, 0, 12);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc81Page54() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 5, 14, 4, 0);
|
||||
validateTable(document, 1, 7, 12, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc88Page134() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 5, 17, 3, 0);
|
||||
validateTable(document, 1, 5, 16, 2, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocThiabendazolePage18() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
validateTableSize(document, 4);
|
||||
validateTable(document, 0, 4, 4, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 3, 0, 0);
|
||||
validateTable(document, 3, 1, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void validateTable(Document document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
int emptyCellsCoundFound = 0;
|
||||
for (List<Cell> entry : rows) {
|
||||
for (Cell cell : entry) {
|
||||
if (cell.toString().equals("")) {
|
||||
emptyCellsCoundFound++;
|
||||
}
|
||||
}
|
||||
}
|
||||
assertThat(emptyCellsCoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
||||
|
||||
assertThat(table.getColCount()).isEqualTo(colCount);
|
||||
assertThat(table.getRowCount()).isEqualTo(rowCount);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void validateTableSize(Document document, int tableSize) {
|
||||
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void prepareStorage() {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
|
||||
storageService.storeObject(TenantContext.getTenantId(),
|
||||
RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
|
||||
new ClassPathResource("files/cv_service_empty_response.json").getInputStream());
|
||||
}
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user