Pull request #532: RED-6619 1

Merge in RED/redaction-service from RED-6619_1 to master * commit 'aff7074b4012627a4e8b64b2f4a779b18feed50a': RED-6619 - renamed variables RED-6619 - reformat code RED-6619 - delete unnecessary import RED-6619 - moved the 1 into a constant instead of moving the logic into a boolean RED-6619 - fix integration-tests by adding versions and move the hasMinimumSize-logic into its own boolean RED-6619 - added missing ' (typo) RED-6619 - add logic to ignore found table-cells with height or width < 1. Also: Fix the tests and add new segmentation-tests and 1 redaction-integration-test. Renamed the latter to fit maven regexp RED-6619 - add tests for table-extraction
2023-05-02 10:54:30 +02:00 · 2023-05-02 10:54:30 +02:00 · 4bf686a432
commit 4bf686a432
parent 2646407805 aff7074b40
25 changed files with 8011 additions and 53 deletions
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java
@ -25,6 +25,8 @@ public class Cell extends Rectangle {

    private boolean isHeaderCell;

+    private static final int MIN_SIZE = 1;
+

    public Cell(Point2D topLeft, Point2D bottomRight) {

@ -66,4 +68,10 @@ public class Cell extends Rectangle {
        return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
    }

+
+    /**
+     * Checks whether this cell reaches the minimum extent in both dimensions.
+     *
+     * @return {@code true} only if the height and the width are each at least {@code MIN_SIZE}
+     */
+    public boolean hasMinimumSize() {
+
+        // Height is tested first; the width is only consulted when the height already qualifies.
+        boolean tallEnough = getHeight() >= MIN_SIZE;
+        return tallEnough && getWidth() >= MIN_SIZE;
+    }
+
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java
@ -261,7 +261,9 @@ public class Table extends AbstractTextContainer {
                    if (intersectionCell.isPresent()) {
                        cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
                    }
-                    row.add(cell);
+                    if (cell.hasMinimumSize()) {
+                        row.add(cell);
+                    }
                }
                prevX = x;
            }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/TableExtractionService.java
@ -88,7 +88,7 @@ public class TableExtractionService {
        for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
            TextBlock textBlock = (TextBlock) abstractTextContainer;
            for (Cell cell : cells) {
-                if (cell.intersects(textBlock.getPdfMinX(),
+                if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
                        textBlock.getPdfMinY(),
                        textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
                        textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
@ -109,7 +109,7 @@ public class TableExtractionService {

            List<Cell> overlappingCells = new ArrayList<>();
            for (Cell c : cells) {
-                if (c.intersects(area)) {
+                if (c.hasMinimumSize() && c.intersects(area)) {
                    overlappingCells.add(c);
                }
            }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationV2Test.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationV2Test.java
@ -3,20 +3,11 @@ package com.iqser.red.service.redaction.v1.server;
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.mockito.Mockito.when;

-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
 import java.util.List;

 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
-import org.kie.api.KieServices;
-import org.kie.api.builder.KieBuilder;
-import org.kie.api.builder.KieFileSystem;
-import org.kie.api.builder.KieModule;
-import org.kie.api.runtime.KieContainer;
 import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
 import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
 import org.springframework.boot.test.context.SpringBootTest;
@ -42,32 +33,17 @@ import lombok.SneakyThrows;

@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
-@Import(RedactionIntegrationTestV2.RedactionIntegrationTestConfiguration.class)
-public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest {
+@Import(RedactionIntegrationV2Test.RedactionIntegrationTestConfiguration.class)
+
+public class RedactionIntegrationV2Test extends AbstractRedactionIntegrationTest {

    private static final String RULES = loadFromClassPath("drools/rules_v2.drl");

    @Configuration
    @EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
-    @ComponentScan(excludeFilters={@ComponentScan.Filter(type= FilterType.ASSIGNABLE_TYPE, value=StorageAutoConfiguration.class)})
+    @ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
    static class RedactionIntegrationTestConfiguration {

-        @Bean
-        public KieContainer kieContainer() {
-
-            KieServices kieServices = KieServices.Factory.get();
-
-            KieFileSystem kieFileSystem = kieServices.newKieFileSystem();
-            InputStream input = new ByteArrayInputStream(RULES.getBytes(StandardCharsets.UTF_8));
-            kieFileSystem.write("src/test/resources/drools/rules_v2", kieServices.getResources().newInputStreamResource(input));
-            KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem);
-            kieBuilder.buildAll();
-            KieModule kieModule = kieBuilder.getKieModule();
-
-            return kieServices.newKieContainer(kieModule.getReleaseId());
-        }
-
-
        @Bean
        @Primary
        public StorageService inmemoryStorage() {
@ -113,7 +89,7 @@ public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest
    /**
     * The case in this test: One term, 'Dr. Alan Miller', is found by PII-Rule and is in the PII-dictionary
     * as well as in the PII-false-positive-list - and in the CBI-author dictionary.
-     * It gets redacted, as the PII-finding is false positive and so the CBI-author entry is effective
+     * It gets redacted, as the PII-finding is false positive and so the CBI-author entry is effective,
     * independent of the entity-rank
     */
    @Test
@ -122,12 +98,20 @@ public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest

        AnalyzeRequest request = uploadFileToStorage("files/new/simplified2.pdf");

-        dictionary.clear();
-        dictionary.put(DICTIONARY_PII, Arrays.asList("Dr. Alan Miller"));
-        dictionary.put(DICTIONARY_AUTHOR, Arrays.asList("Dr. Alan Miller"));
+        String entryAuthorAndPIIDictionary = "Dr. Alan Miller";

-        falsePositive.clear();
-        falsePositive.put(DICTIONARY_PII,  Arrays.asList("Dr. Alan Miller COMPLETION DATE:"));
+        dictionary.get(DICTIONARY_AUTHOR).add(entryAuthorAndPIIDictionary);
+        dictionary.put(DICTIONARY_PII, List.of(entryAuthorAndPIIDictionary));
+
+        String entryPIIFalsePositive = "Dr. Alan Miller COMPLETION DATE:";
+
+        falsePositive.put(DICTIONARY_PII, List.of(entryPIIFalsePositive));
+
+        reanlysisVersions.put(entryAuthorAndPIIDictionary, 1L);
+        reanlysisVersions.put(entryPIIFalsePositive, 1L);
+
+        when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(3L);
+        mockDictionaryCalls(0L);

        analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
        analyzeService.analyze(request);
@ -139,7 +123,45 @@ public class RedactionIntegrationTestV2 extends AbstractRedactionIntegrationTest
        RedactionLogEntry redactionLogEntry = redactionLog.getRedactionLogEntry().get(0);

        assertThat(redactionLogEntry.getType()).isEqualTo(DICTIONARY_AUTHOR);
-        assertThat(redactionLogEntry.getValue()).isEqualTo("Dr. Alan Miller");
+        assertThat(redactionLogEntry.getValue()).isEqualTo(entryAuthorAndPIIDictionary);
+        assertThat(redactionLogEntry.isRedacted()).isEqualTo(true);
+        assertThat(redactionLogEntry.isRecommendation()).isEqualTo(false);
+        assertThat(redactionLogEntry.isFalsePositive()).isEqualTo(false);
+        assertThat(redactionLogEntry.isExcluded()).isEqualTo(false);
+        assertThat(redactionLogEntry.isDictionaryEntry()).isEqualTo(true);
+
+        assertThat(redactionLogEntry.getEngines().size()).isEqualTo(1);
+        assertThat(redactionLogEntry.getEngines().contains(Engine.DICTIONARY)).isEqualTo(true);
+
+    }
+
+
+    /**
+     * The case in this test: The term 'Evans P.G.' is very close to a table-cell. It will get redacted nevertheless.
+     */
+    @Test
+    @SneakyThrows
+    public void testTermGetsRedactedEvenItsCloseToCellBorder() {
+
+        AnalyzeRequest request = uploadFileToStorage("files/SinglePages/VV-931175_Page1.pdf");
+
+        dictionary.clear();
+        falsePositive.clear();
+
+        String entryAuthorDictionary = "Evans P.G.";
+        dictionary.put(DICTIONARY_AUTHOR, List.of(entryAuthorDictionary));
+
+        analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
+        analyzeService.analyze(request);
+
+        var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
+
+        assertThat(redactionLog.getRedactionLogEntry().size()).isEqualTo(1);
+
+        RedactionLogEntry redactionLogEntry = redactionLog.getRedactionLogEntry().get(0);
+
+        assertThat(redactionLogEntry.getType()).isEqualTo(DICTIONARY_AUTHOR);
+        assertThat(redactionLogEntry.getValue()).isEqualTo(entryAuthorDictionary);
        assertThat(redactionLogEntry.isRedacted()).isEqualTo(true);
        assertThat(redactionLogEntry.isRecommendation()).isEqualTo(false);
        assertThat(redactionLogEntry.isFalsePositive()).isEqualTo(false);
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
@ -4,6 +4,7 @@ import static org.assertj.core.api.Assertions.assertThat;

 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
@ -92,7 +93,7 @@ public class PdfSegmentationServiceTest {

    @Configuration
    @EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
-    @ComponentScan(excludeFilters={@ComponentScan.Filter(type= FilterType.ASSIGNABLE_TYPE, value=StorageAutoConfiguration.class)})
+    @ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
    public static class TestConfiguration {

        @Bean
@ -105,6 +106,14 @@ public class PdfSegmentationServiceTest {
    }


+    /**
+     * Stores the classpath resource {@code files/cv_service_empty_response.json} under the
+     * {@code FileType.TABLES} storage id of the test dossier/file for the current tenant.
+     * Every segmentation test in this class calls it before parsing a document.
+     */
+    @SneakyThrows
+    private void prepareStorage() {
+
+        var tenantId = TenantContext.getTenantId();
+        var tablesStorageId = RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES);
+        var emptyCvResponse = new ClassPathResource("files/cv_service_empty_response.json").getInputStream();
+
+        storageService.storeObject(tenantId, tablesStorageId, emptyCvResponse);
+    }
+
    @Test
    @SneakyThrows
    public void testMapping() {
@ -136,7 +145,7 @@ public class PdfSegmentationServiceTest {

        Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
        assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
-        Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
+        Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
        assertThat(table.getColCount()).isEqualTo(6);
        assertThat(table.getRowCount()).isEqualTo(13);
        assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
@ -151,14 +160,14 @@ public class PdfSegmentationServiceTest {

        Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
        assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
-        Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
+        Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
        assertThat(firstTable.getColCount()).isEqualTo(8);
        assertThat(firstTable.getRowCount()).isEqualTo(1);
-        Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
+        Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
        assertThat(secondTable.getColCount()).isEqualTo(8);
        assertThat(secondTable.getRowCount()).isEqualTo(2);
        List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
-        assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
+        assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
    }


@ -170,14 +179,14 @@ public class PdfSegmentationServiceTest {

        Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
        assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
-        Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
+        Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
        assertThat(firstTable.getColCount()).isEqualTo(9);
        assertThat(firstTable.getRowCount()).isEqualTo(5);
-        Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
+        Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
        assertThat(secondTable.getColCount()).isEqualTo(9);
        assertThat(secondTable.getRowCount()).isEqualTo(6);
        List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
-        assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
+        assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
    }


@ -189,22 +198,406 @@ public class PdfSegmentationServiceTest {

        Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
        assertThat(document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
-        Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
+        Table firstTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
        assertThat(firstTable.getColCount()).isEqualTo(8);
        assertThat(firstTable.getRowCount()).isEqualTo(1);
-        Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
+        Table secondTable = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
        assertThat(secondTable.getColCount()).isEqualTo(8);
        assertThat(secondTable.getRowCount()).isEqualTo(6);
        List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
-        assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).collect(Collectors.toList()).equals(firstTableHeaderCells))).isTrue();
+        assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
    }


-    @SneakyThrows
-    private void prepareStorage() {
+    /**
+     * Segments the single-page fixture of document 56 (page 170) and expects four tables.
+     * Each {@code validateTable} call passes: table index, column count, row count and the two
+     * empty-cell counts whose sum must match the number of empty cells found.
+     */
+    @Test
+    public void testDoc56Page170() throws IOException {
+
+        prepareStorage();
+        ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
+
+        Document document = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, pdfFileResource.getInputStream(), null);
+
+        validateTableSize(document, 4);
+
+        validateTable(document, 0, 1, 1, 0, 0);
+        validateTable(document, 1, 2, 2, 0, 0);
+        validateTable(document, 2, 7, 20, 0, 140);
+        validateTable(document, 3, 8, 31, 0, 170);

-        storageService.storeObject(TenantContext.getTenantId(), RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.TABLES),
-                new ClassPathResource("files/cv_service_empty_response.json").getInputStream());
    }

+
+    /**
+     * Segments the single-page fixture VV-931175 (page 1) and expects exactly one table with
+     * 8 columns and 9 rows, two of whose cells are empty. The text of the first three rows
+     * (24 cells, in reading order) is additionally compared against the expected strings.
+     */
+    @Test
+    public void testVV931175Page1() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 8, 9, 0, 2);
+
+        // The second row repeats the same spanning caption in most of its columns.
+        String partACaption = "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies";
+
+        List<String> headerRow = Arrays.asList("Annex point Reference within DAR/RAR",
+                "Author, date",
+                "Study title",
+                "Analytical method Author, date, No.",
+                "Technique, LOQ of the method, validated working range",
+                "Method meets analytical validation criteria",
+                "Remarks (in case validation criteria are not met)",
+                "Acceptability of the method");
+
+        List<String> captionRow = Arrays.asList("", partACaption, partACaption, partACaption, partACaption, "", partACaption, partACaption);
+
+        List<String> firstDataRow = Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
+                "Evans P.G. 2001 TMJ4569B, VV-323245",
+                "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
+                "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
+                "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
+                "Y",
+                "N/A",
+                "Y");
+
+        validateTable(parsed, 0, Arrays.asList(headerRow, captionRow, firstDataRow));
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 27 (page 6) and expects four three-column
+     * tables (2, 2, 3 and 3 rows) without any empty cells.
+     */
+    @Test
+    public void testDoc27Page6() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 4);
+        validateTable(parsed, 0, 3, 2, 0, 0);
+        validateTable(parsed, 1, 3, 2, 0, 0);
+        validateTable(parsed, 2, 3, 3, 0, 0);
+        validateTable(parsed, 3, 3, 3, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture A20622A Part B9 (page 185) and expects two tables:
+     * 5 columns x 5 rows with 23 empty cells, and 11 columns x 9 rows with 36 empty cells.
+     */
+    @Test
+    public void testDocA20622APartB9Page185() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 5, 5, 0, 23);
+        validateTable(parsed, 1, 11, 9, 0, 36);
+    }
+
+
+    /**
+     * Segments the single-page fixture A20622A Part B7 (page 123) and expects six two-column
+     * tables with 1, 1, 5, 5, 4 and 1 rows, none of them containing empty cells.
+     */
+    @Test
+    public void testDocA20622APartB7Page123() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 6);
+        validateTable(parsed, 0, 2, 1, 0, 0);
+        validateTable(parsed, 1, 2, 1, 0, 0);
+        validateTable(parsed, 2, 2, 5, 0, 0);
+        validateTable(parsed, 3, 2, 5, 0, 0);
+        validateTable(parsed, 4, 2, 4, 0, 0);
+        validateTable(parsed, 5, 2, 1, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 77 and expects three tables
+     * (7 x 9, 2 x 1 and 2 x 10 columns x rows) without empty cells.
+     *
+     * NOTE(review): the method name says Page111 while the fixture file ends in "_Page11.pdf";
+     * confirm which of the two is intended.
+     */
+    @Test
+    public void testDoc77Page111() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 3);
+        validateTable(parsed, 0, 7, 9, 0, 0);
+        validateTable(parsed, 1, 2, 1, 0, 0);
+        validateTable(parsed, 2, 2, 10, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 95 (page 532) and expects a single
+     * 9 x 9 table without empty cells.
+     */
+    @Test
+    public void testDoc95Page532() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 9, 9, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 52 (page 175) and expects a single table
+     * with 9 columns, 5 rows and 6 empty cells.
+     */
+    @Test
+    public void testDoc52Page175() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 9, 5, 6, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 52 (page 174) and expects a single table
+     * with 9 columns, 6 rows and 7 empty cells.
+     */
+    @Test
+    public void testDoc52Page174() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 9, 6, 7, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 19 (page 35) and expects a single table
+     * with 10 columns, 6 rows and one empty cell.
+     */
+    @Test
+    public void testDoc19Page35() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 10, 6, 0, 1);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 19 (page 161) and expects two tables
+     * (2 x 2 and 1 x 1) without empty cells.
+     */
+    @Test
+    public void testDoc19Page161() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 2, 2, 0, 0);
+        validateTable(parsed, 1, 1, 1, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 47 (page 30) and expects two tables,
+     * each with 7 columns, 8 rows and one empty cell.
+     */
+    @Test
+    public void testDoc47Page30() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource(
+                "files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 7, 8, 1, 0);
+        validateTable(parsed, 1, 7, 8, 1, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 49 (page 61) and expects two tables
+     * (4 columns x 17 rows and 7 columns x 12 rows) without empty cells.
+     */
+    @Test
+    public void testDoc49Page61() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource(
+                "files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 4, 17, 0, 0);
+        validateTable(parsed, 1, 7, 12, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 81 (page 54) and expects two tables:
+     * 5 columns x 14 rows with 4 empty cells, and 7 columns x 12 rows without empty cells.
+     */
+    @Test
+    public void testDoc81Page54() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 5, 14, 4, 0);
+        validateTable(parsed, 1, 7, 12, 0, 0);
+    }
+
+
+    /**
+     * Segments the single-page Pydiflumetofen fixture (page 134) and expects two five-column
+     * tables: 17 rows with 3 empty cells, and 16 rows with 2 empty cells.
+     *
+     * NOTE(review): the method name says document 88 while the fixture file name starts with
+     * "85"; confirm which of the two is intended.
+     */
+    @Test
+    public void testDoc88Page134() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 5, 17, 3, 0);
+        validateTable(parsed, 1, 5, 16, 2, 0);
+    }
+
+
+    /**
+     * Segments the single-page Thiabendazole fixture (page 18) and expects four tables
+     * (4 x 4, 1 x 1, 2 x 3 and 1 x 1 columns x rows) without empty cells.
+     */
+    @Test
+    public void testDocThiabendazolePage18() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 4);
+        validateTable(parsed, 0, 4, 4, 0, 0);
+        validateTable(parsed, 1, 1, 1, 0, 0);
+        validateTable(parsed, 2, 2, 3, 0, 0);
+        validateTable(parsed, 3, 1, 1, 0, 0);
+    }
+
+    /**
+     * Segments the single-page fixture of document 15 (page 18) and expects a single table
+     * with 11 columns and 8 rows without empty cells.
+     */
+    @Test
+    public void testDoc15Page18() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 11, 8, 0, 0);
+    }
+
+    /**
+     * Segments the single-page fixture of document 28 (page 23) and expects two tables with
+     * 6 columns and 8 rows each; the first has two empty cells, the second one.
+     */
+    @Test
+    public void testDoc28Page23() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 2);
+        validateTable(parsed, 0, 6, 8, 0, 2);
+        validateTable(parsed, 1, 6, 8, 0, 1);
+    }
+
+
+    /**
+     * Segments the single-page fixture of document 24 (page 17) and expects a single table
+     * with 9 columns, 5 rows and 2 empty cells.
+     */
+    @Test
+    public void testDoc24Page17() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 9, 5, 2, 0);
+    }
+
+    /**
+     * Segments the single-page fixture of document 30 (page 5) and expects a single table
+     * with 3 columns and 5 rows without empty cells.
+     */
+    @Test
+    public void testDoc30Page5() throws IOException {
+
+        prepareStorage();
+        var singlePage = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
+        Document parsed = pdfSegmentationService.parseDocument(TEST_DOSSIER_ID, TEST_FILE_ID, singlePage.getInputStream(), null);
+
+        validateTableSize(parsed, 1);
+        validateTable(parsed, 0, 3, 5, 0, 0);
+    }
+
+
+
+    /**
+     * Asserts the shape of one extracted table: its number of empty cells, its column count and its row count.
+     * <p>
+     * The table is looked up by position in the list obtained by concatenating the tables of all paragraphs of
+     * the document. A cell counts as empty when its {@code toString()} is the empty string. Only the sum of the
+     * two empty-cell parameters is compared; they are kept separate so a call site can record how many of the
+     * empty cells it considers legitimate and how many it considers extraction errors.
+     *
+     * @param document                 the parsed document to inspect
+     * @param tableIndex               zero-based position of the table among all tables of the document
+     * @param colCount                 expected value of {@code table.getColCount()}
+     * @param rowCount                 expected value of {@code table.getRowCount()}
+     * @param emptyCellsCountCorrect   first summand of the expected number of empty cells
+     * @param emptyCellsCountIncorrect second summand of the expected number of empty cells
+     */
+    private void validateTable(Document document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
+
+        Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
+
+        // Count directly on the stream instead of materialising intermediate lists just to ask for their size.
+        int emptyCellsFound = Math.toIntExact(table.getRows().stream().flatMap(List::stream).filter(cell -> cell.toString().isEmpty()).count());
+
+        assertThat(emptyCellsFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
+
+        assertThat(table.getColCount()).isEqualTo(colCount);
+        assertThat(table.getRowCount()).isEqualTo(rowCount);
+
+    }
+    /**
+     * Asserts that the cells of one extracted table render to the given texts.
+     * <p>
+     * Both the table rows and {@code values} are flattened row by row and compared position by position via
+     * {@code Cell.toString()}. Only as many cells as there are expected values are compared; any further cells
+     * of the table are not checked.
+     *
+     * @param document   the parsed document to inspect
+     * @param tableIndex zero-based position of the table among all tables of the document
+     * @param values     expected cell texts, one inner list per row
+     */
+    private void validateTable(Document document, int tableIndex, List<List<String>> values) {
+
+        Table table = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
+
+        List<Cell> rowsFlattened = table.getRows().stream().flatMap(List::stream).toList();
+        List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
+
+        // Fail with a readable message instead of an IndexOutOfBoundsException when the table is too small.
+        if (rowsFlattened.size() < valuesFlattened.size()) {
+            throw new AssertionError("Table " + tableIndex + " has " + rowsFlattened.size() + " cells but " + valuesFlattened.size() + " values were expected");
+        }
+
+        for (int i = 0; i < valuesFlattened.size(); i++) {
+            Cell cell = rowsFlattened.get(i);
+            String value = valuesFlattened.get(i);
+            assertThat(cell.toString()).isEqualTo(value);
+        }
+
+    }
+
+    /**
+     * Asserts how many tables were extracted from the document in total, summed over all of its paragraphs.
+     *
+     * @param document  the parsed document to inspect
+     * @param tableSize expected total number of tables
+     */
+    private void validateTableSize(Document document, int tableSize) {
+
+        List<Table> allTables = document.getParagraphs().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
+        int actualTableCount = allTables.size();
+
+        assertThat(actualTableCount).isEqualTo(tableSize);
+
+    }
+
+
+
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/15
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/15
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/19
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/19
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/19
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/19
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/24
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/24
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/27
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/27
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/28
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/28
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/30
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/30
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/47
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/47
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/49
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/49
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/52
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/52
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/52
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/52
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/56
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/56
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/77
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/77
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/81
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/81
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/85
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/85
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/95
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/95
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/A20622A
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/A20622A
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/A20622A
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/A20622A
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/Thiabendazole
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/Thiabendazole
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/VV-931175_Page1.pdf
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/SinglePages/VV-931175_Page1.pdf