Pull request #532: RED-6619 1
Merge in RED/redaction-service from RED-6619_1 to master * commit 'aff7074b4012627a4e8b64b2f4a779b18feed50a': RED-6619 - renamed variables RED-6619 - reformat code RED-6619 - delete unnecessary import RED-6619 - moved the 1 into a constant instead of moving the logic into a boolean RED-6619 - fix integration-tests by adding versions and move the hasMinimumSize-logic into its own boolean RED-6619 - added missing ' (typo) RED-6619 - add logic to ignore found table-cells with height or width < 1. Also: Fix the tests and add new segmentation-tests and 1 redaction-integration-test. Renamed the latter to fit maven regexp RED-6619 - add tests for table-extraction
This commit is contained in:
commit
4bf686a432
@ -25,6 +25,8 @@ public class Cell extends Rectangle {
|
||||
|
||||
private boolean isHeaderCell;
|
||||
|
||||
private static final int MIN_SIZE = 1;
|
||||
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
@ -66,4 +68,10 @@ public class Cell extends Rectangle {
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
|
||||
}
|
||||
|
||||
|
||||
public boolean hasMinimumSize() {
|
||||
|
||||
return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -261,7 +261,9 @@ public class Table extends AbstractTextContainer {
|
||||
if (intersectionCell.isPresent()) {
|
||||
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
|
||||
}
|
||||
row.add(cell);
|
||||
if (cell.hasMinimumSize()) {
|
||||
row.add(cell);
|
||||
}
|
||||
}
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
@ -88,7 +88,7 @@ public class TableExtractionService {
|
||||
for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
|
||||
TextBlock textBlock = (TextBlock) abstractTextContainer;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.intersects(textBlock.getPdfMinX(),
|
||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
@ -109,7 +109,7 @@ public class TableExtractionService {
|
||||
|
||||
List<Cell> overlappingCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.intersects(area)) {
|
||||
if (c.hasMinimumSize() && c.intersects(area)) {
|
||||
overlappingCells.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3,20 +3,11 @@ package com.iqser.red.service.redaction.v1.server;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.kie.api.KieServices;
|
||||
import org.kie.api.builder.KieBuilder;
|
||||
import org.kie.api.builder.KieFileSystem;
|
||||
import org.kie.api.builder.KieModule;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
@ -42,32 +33,17 @@ import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(RedactionIntegrationTestV2.RedactionIntegrationTestConfiguration.class)
|
||||
public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest {
|
||||
@Import(RedactionIntegrationV2Test.RedactionIntegrationTestConfiguration.class)
|
||||
|
||||
public class RedactionIntegrationV2Test extends AbstractRedactionIntegrationTest {
|
||||
|
||||
private static final String RULES = loadFromClassPath("drools/rules_v2.drl");
|
||||
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
@ComponentScan(excludeFilters={@ComponentScan.Filter(type= FilterType.ASSIGNABLE_TYPE, value=StorageAutoConfiguration.class)})
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
static class RedactionIntegrationTestConfiguration {
|
||||
|
||||
@Bean
|
||||
public KieContainer kieContainer() {
|
||||
|
||||
KieServices kieServices = KieServices.Factory.get();
|
||||
|
||||
KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
|
||||
InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
|
||||
kieFileSystem.write("src/test/resources/drools/rules_v2", kieServices.getResources().newInputStreamResource(input));
|
||||
KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
|
||||
kieBuilder.buildAll();
|
||||
KieModule kieModule = kieBuilder.getKieModule();
|
||||
|
||||
return kieServices.newKieContainer(kieModule.getReleaseId());
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
@ -113,7 +89,7 @@ public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest
|
||||
/**
|
||||
* The case in this test: One term, 'Dr. Alan Miller', is found by PII-Rule and is in the PII-dictionary
|
||||
* as well as in the PII-false-positive-list - and in the CBI-author dictionary.
|
||||
* It gets redacted, as the PII-finding is false positive and so the CBI-author entry is effective
|
||||
* It gets redacted, as the PII-finding is false positive and so the CBI-author entry is effective,
|
||||
* independent of the entity-rank
|
||||
*/
|
||||
@Test
|
||||
@ -122,12 +98,20 @@ public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/new/simplified2.pdf");
|
||||
|
||||
dictionary.clear();
|
||||
dictionary.put(DICTIONARY_PII, Arrays.asList("Dr. Alan Miller"));
|
||||
dictionary.put(DICTIONARY_AUTHOR, Arrays.asList("Dr. Alan Miller"));
|
||||
String entryAuthorAndPIIDictionary = "Dr. Alan Miller";
|
||||
|
||||
falsePositive.clear();
|
||||
falsePositive.put(DICTIONARY_PII, Arrays.asList("Dr. Alan Miller COMPLETION DATE:"));
|
||||
dictionary.get(DICTIONARY_AUTHOR).add(entryAuthorAndPIIDictionary);
|
||||
dictionary.put(DICTIONARY_PII, List.of(entryAuthorAndPIIDictionary));
|
||||
|
||||
String entryPIIFalsePositive = "Dr. Alan Miller COMPLETION DATE:";
|
||||
|
||||
falsePositive.put(DICTIONARY_PII, List.of(entryPIIFalsePositive));
|
||||
|
||||
reanlysisVersions.put(entryAuthorAndPIIDictionary, 1L);
|
||||
reanlysisVersions.put(entryPIIFalsePositive, 1L);
|
||||
|
||||
when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L);
|
||||
mockDictionaryCalls(0L);
|
||||
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
analyzeService.analyze(request);
|
||||
@ -139,7 +123,45 @@ public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest
|
||||
RedactionLogEntry redactionLogEntry = redactionLog.getRedactionLogEntry().get(0);
|
||||
|
||||
assertThat(redactionLogEntry.getType()).isEqualTo(DICTIONARY_AUTHOR);
|
||||
assertThat(redactionLogEntry.getValue()).isEqualTo("Dr. Alan Miller");
|
||||
assertThat(redactionLogEntry.getValue()).isEqualTo(entryAuthorAndPIIDictionary);
|
||||
assertThat(redactionLogEntry.isRedacted()).isEqualTo(true);
|
||||
assertThat(redactionLogEntry.isRecommendation()).isEqualTo(false);
|
||||
assertThat(redactionLogEntry.isFalsePositive()).isEqualTo(false);
|
||||
assertThat(redactionLogEntry.isExcluded()).isEqualTo(false);
|
||||
assertThat(redactionLogEntry.isDictionaryEntry()).isEqualTo(true);
|
||||
|
||||
assertThat(redactionLogEntry.getEngines().size()).isEqualTo(1);
|
||||
assertThat(redactionLogEntry.getEngines().contains(Engine.DICTIONARY)).isEqualTo(true);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The case in this test: The term 'Evans P.G.' is very close to a table-cell. It will get redacted nevertheless.
|
||||
*/
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTermGetsRedactedEvenItsCloseToCellBorder() {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/SinglePages/VV-931175_Page1.pdf");
|
||||
|
||||
dictionary.clear();
|
||||
falsePositive.clear();
|
||||
|
||||
String entryAuthorDictionary = "Evans P.G.";
|
||||
dictionary.put(DICTIONARY_AUTHOR, List.of(entryAuthorDictionary));
|
||||
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
analyzeService.analyze(request);
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||
|
||||
assertThat(redactionLog.getRedactionLogEntry().size()).isEqualTo(1);
|
||||
|
||||
RedactionLogEntry redactionLogEntry = redactionLog.getRedactionLogEntry().get(0);
|
||||
|
||||
assertThat(redactionLogEntry.getType()).isEqualTo(DICTIONARY_AUTHOR);
|
||||
assertThat(redactionLogEntry.getValue()).isEqualTo(entryAuthorDictionary);
|
||||
assertThat(redactionLogEntry.isRedacted()).isEqualTo(true);
|
||||
assertThat(redactionLogEntry.isRecommendation()).isEqualTo(false);
|
||||
assertThat(redactionLogEntry.isFalsePositive()).isEqualTo(false);
|
||||
@ -4,6 +4,7 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -92,7 +93,7 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
@Configuration
|
||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||
@ComponentScan(excludeFilters={@ComponentScan.Filter(type= FilterType.ASSIGNABLE_TYPE, value=StorageAutoConfiguration.class)})
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
public static class TestConfiguration {
|
||||
|
||||
@Bean
|
||||
@ -105,6 +106,14 @@ public class PdfSegmentationServiceTest {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void prepareStorage() {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(),
|
||||
RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
|
||||
new ClassPathResource("files/cv_service_empty_response.json").getInputStream());
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMapping() {
|
||||
@ -136,7 +145,7 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
@ -151,14 +160,14 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -170,14 +179,14 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -189,22 +198,406 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void prepareStorage() {
|
||||
@Test
|
||||
public void testDoc56Page170() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 2, 0, 0);
|
||||
validateTable(document, 2, 7, 20, 0, 140);
|
||||
validateTable(document, 3, 8, 31, 0, 170);
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
|
||||
new ClassPathResource("files/cv_service_empty_response.json").getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testVV931175Page1() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 8, 9, 0, 2);
|
||||
|
||||
List<List<String>> values = Arrays.asList(
|
||||
Arrays.asList(
|
||||
"Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"
|
||||
),
|
||||
Arrays.asList(
|
||||
"",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"
|
||||
),
|
||||
Arrays.asList(
|
||||
"CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"
|
||||
)
|
||||
);
|
||||
|
||||
validateTable(document, 0, values);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc27Page6() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 3, 2, 0, 0);
|
||||
validateTable(document, 1, 3, 2, 0, 0);
|
||||
validateTable(document, 2, 3, 3, 0, 0);
|
||||
validateTable(document, 3, 3, 3, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocA20622APartB9Page185() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 5, 0, 23);
|
||||
validateTable(document, 1, 11, 9, 0, 36);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocA20622APartB7Page123() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
validateTable(document, 0, 2, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 5, 0, 0);
|
||||
validateTable(document, 3, 2, 5, 0, 0);
|
||||
validateTable(document, 4, 2, 4, 0, 0);
|
||||
validateTable(document, 5, 2, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc77Page111() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 3);
|
||||
|
||||
validateTable(document, 0, 7, 9, 0, 0);
|
||||
validateTable(document, 1, 2, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 10, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc95Page532() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 9, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc52Page175() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 9, 5, 6, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc52Page174() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 6, 7, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc19Page35() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc19Page161() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 2, 2, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc47Page30() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 7, 8, 1, 0);
|
||||
validateTable(document, 1, 7, 8, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc49Page61() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 4, 17, 0, 0);
|
||||
validateTable(document, 1, 7, 12, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc81Page54() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 14, 4, 0);
|
||||
validateTable(document, 1, 7, 12, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc88Page134() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 17, 3, 0);
|
||||
validateTable(document, 1, 5, 16, 2, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocThiabendazolePage18() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 4, 4, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 3, 0, 0);
|
||||
validateTable(document, 3, 1, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoc15Page18() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 11, 8, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoc28Page23() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 6, 8, 0, 2);
|
||||
validateTable(document, 1, 6, 8, 0, 1);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc24Page17() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 9, 5, 2, 0);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoc30Page5() throws IOException {
|
||||
|
||||
prepareStorage();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
|
||||
|
||||
Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 3, 5, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void validateTable(Document document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||
|
||||
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
||||
|
||||
assertThat(table.getColCount()).isEqualTo(colCount);
|
||||
assertThat(table.getRowCount()).isEqualTo(rowCount);
|
||||
|
||||
}
|
||||
private void validateTable(Document document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
||||
|
||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||
Cell cell = rowsFlattened.get(i);
|
||||
String value = valuesFlattened.get(i);
|
||||
assertThat(cell.toString()).isEqualTo(value);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void validateTableSize(Document document, int tableSize) {
|
||||
|
||||
assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user