"fixed" memory issues by calling GC manually, removing soft reference cache and disposing images properly

2021-04-16 23:13:12 +03:00 · 2021-04-16 23:13:12 +03:00 · 8060e3a29f
commit 8060e3a29f
parent 4749858e80
12 changed files with 354 additions and 263 deletions
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java
@ -18,6 +18,7 @@ import org.springframework.context.annotation.Import;
 public class Application {

    public static void main(String[] args) {
+        System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
        SpringApplication.run(Application.class, args);
    }

--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java
@ -79,50 +79,61 @@ public class RedactionController implements RedactionResource {
    @Override
    public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
        var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
+        try {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);

-        try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
-            pdDocument.setAllSecurityToBeRemoved(true);
-            pdDocument.setResourceCache(null);
+            storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
+            try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
+                pdDocument.setAllSecurityToBeRemoved(true);

-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
+                pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);

-            return convert(pdDocument, classifiedDoc.getPages().size());
+                return convert(pdDocument, classifiedDoc.getPages().size());
+
+            } catch (IOException e) {
+                throw new RedactionException(e);
+            }

        } catch (IOException e) {
            throw new RedactionException(e);
        }

+
    }

    @Override
    public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
        var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
+        try {
+            Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);

-        try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
-            pdDocument.setAllSecurityToBeRemoved(true);
+            storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
+            try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
+                pdDocument.setAllSecurityToBeRemoved(true);

-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
+                pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
+                return convert(pdDocument, classifiedDoc.getPages().size());

-            return convert(pdDocument, classifiedDoc.getPages().size());
+            } catch (IOException e) {
+                throw new RedactionException(e);
+            }

        } catch (IOException e) {
            throw new RedactionException(e);
        }

+
    }


    @Override
    public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {

-        var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
-
        Document classifiedDoc;
-        try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
-            pdDocument.setAllSecurityToBeRemoved(true);
-            classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+
+        try {
+            var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
+            classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
        } catch (Exception e) {
            throw new RedactionException(e);
        }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java
@ -0,0 +1,75 @@
+package com.iqser.red.service.redaction.v1.server.memory;
+
+import lombok.extern.slf4j.Slf4j;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+
+/**
+ * Helpers for logging the JVM memory figures reported by {@link Runtime}.
+ */
+@Slf4j
+public class MemoryStats {
+
+
+    /**
+     * Logs used, free, total and max memory of the JVM at INFO level, formatted with
+     * {@link #humanReadableByteCountBin(long)}.
+     */
+    public static void printMemoryStats() {
+        log.info("\n\n ------------------------------ \n" +
+                " Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" +
+                " Free Memory: " + humanReadableByteCountBin(getFreeMemory()) + "\n" +
+                " Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" +
+                " Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" +
+                "\n ------------------------------ \n");
+    }
+
+
+    /**
+     * Formats a byte count with binary (1024-based) units, e.g. {@code 1536} gives "1.5 KiB"
+     * (the decimal separator follows the default locale). Magnitudes below 1024 are printed
+     * as plain bytes, e.g. "512 B".
+     *
+     * @param bytes the byte count; may be negative
+     * @return the formatted value followed by its unit (B, KiB, MiB, GiB, TiB, PiB or EiB)
+     */
+    public static String humanReadableByteCountBin(long bytes) {
+        long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes);
+        if (absB < 1024) {
+            return bytes + " B";
+        }
+        long value = absB;
+        CharacterIterator ci = new StringCharacterIterator("KMGTPE");
+        for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) {
+            value >>= 10;
+            ci.next();
+        }
+        value *= Long.signum(bytes);
+        return String.format("%.1f %ciB", value / 1024.0, ci.current());
+    }
+
+    /** Maximum amount of memory the JVM will attempt to use ({@link Runtime#maxMemory()}). */
+    private static long getMaxMemory() {
+        return Runtime.getRuntime().maxMemory();
+    }
+
+    /**
+     * Memory actually in use: the allocated memory minus its free part. Subtracting from
+     * {@code maxMemory()} instead would also count memory the JVM has not allocated yet
+     * as used.
+     */
+    private static long getUsedMemory() {
+        return getTotalMemory() - getFreeMemory();
+    }
+
+    /** Memory currently allocated to the JVM ({@link Runtime#totalMemory()}). */
+    private static long getTotalMemory() {
+        return Runtime.getRuntime().totalMemory();
+    }
+
+    /** Free part of the currently allocated memory ({@link Runtime#freeMemory()}). */
+    private static long getFreeMemory() {
+        return Runtime.getRuntime().freeMemory();
+    }
+}
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java
@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
 import lombok.Getter;
 import lombok.Setter;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.reflect.FieldUtils;
 import org.apache.pdfbox.contentstream.operator.Operator;
 import org.apache.pdfbox.contentstream.operator.OperatorName;
 import org.apache.pdfbox.contentstream.operator.color.*;
@ -195,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
                Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
                        .getWidth(), (float) imageBounds.getHeight());

+                // Memory hack: bypass the SoftReference image cache (see cachedImageSubsampling below)
+                FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true);
+
                if (rect.getHeight() > 2 && rect.getWidth() > 2) {
                    this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber));
                }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java
@ -1,8 +1,6 @@
 package com.iqser.red.service.redaction.v1.server.redaction.model;

-import lombok.AllArgsConstructor;
 import lombok.Data;
-import lombok.NoArgsConstructor;
 import lombok.NonNull;
 import lombok.RequiredArgsConstructor;

@ -10,12 +8,9 @@ import java.awt.geom.Rectangle2D;
 import java.awt.image.BufferedImage;

@Data
-@NoArgsConstructor
-@AllArgsConstructor
@RequiredArgsConstructor
 public class PdfImage {

-    @NonNull
    private BufferedImage image;
    @NonNull
    private Rectangle2D position;
@ -25,4 +20,22 @@ public class PdfImage {
    @NonNull
    private int page;

+    /**
+     * Creates an image entry.
+     *
+     * @param image    the decoded image data; may be null
+     * @param position the bounding box of the image; must not be null
+     * @param page     the number of the page the image belongs to
+     * @throws NullPointerException if {@code position} is null
+     */
+    public PdfImage(BufferedImage image, Rectangle2D position, int page) {
+        // position is declared @NonNull; enforce it here as the Lombok-generated members do
+        if (position == null) {
+            throw new NullPointerException("position is marked non-null but is null");
+        }
+        this.image = image;
+        this.position = position;
+        this.page = page;
+    }
+
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java
@ -1,6 +1,6 @@
 package com.iqser.red.service.redaction.v1.server.redaction.service;

-import com.iqser.red.service.redaction.v1.server.classification.model.Document;
+import com.iqser.red.service.redaction.v1.server.classification.model.Page;
 import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
 import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
 import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
@ -23,37 +23,40 @@ public class ImageClassificationService {
    private final RedactionServiceSettings settings;


-    public void classifyImages(Document classifiedDoc) {
+    public void classifyImages(Page page) {

-        long start = System.currentTimeMillis();
-        classifiedDoc.getPages().forEach(page -> {
-            page.getImages().forEach(image -> {
+        page.getImages().forEach(image -> {

-                if (settings.isEnableImageClassification()) {
-                    try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
-                        ImageIO.write(image.getImage(), "png", baos);
-                        ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
-                                .toByteArray()));
-                        image.setImageType(ImageType.valueOf(response.getCategory()));
+            if (settings.isEnableImageClassification()) {

-                    } catch (IOException e) {
-                        log.error("Could not classify image", e);
+                long start = System.currentTimeMillis();
+                try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+                    ImageIO.write(image.getImage(), "png", baos);
+                    var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
+                    ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
+                    image.setImageType(ImageType.valueOf(response.getCategory()));
+                } catch (IOException e) {
+                    log.error("Could not classify image", e);
+                }
+
+                log.info("Image classification took: " + (System.currentTimeMillis() - start));
+            } else {
+                image.setImageType(ImageType.OTHER);
+            }
+
+            image.getImage().flush();
+            image.setImage(null);
+
+            if (image.getImageType().equals(ImageType.OTHER)) {
+                page.getTextBlocks().forEach(textblock -> {
+                    if (image.getPosition()
+                            .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
+                        image.setImageType(ImageType.OCR);
                    }
-                } else {
-                    image.setImageType(ImageType.OTHER);
-                }
-
-                if (image.getImageType().equals(ImageType.OTHER)) {
-                    page.getTextBlocks().forEach(textblock -> {
-                        if (image.getPosition()
-                                .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
-                            image.setImageType(ImageType.OCR);
-                        }
-                    });
-                }
-            });
+                });
+            }
        });
-        log.info("Image classification took: " + (System.currentTimeMillis() - start));
+
    }

 }
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java
@ -13,8 +13,6 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer
 import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.pdfbox.io.MemoryUsageSetting;
-import org.apache.pdfbox.pdmodel.PDDocument;
 import org.kie.api.runtime.KieContainer;
 import org.springframework.stereotype.Service;
 import org.springframework.web.bind.annotation.RequestBody;
@ -36,27 +34,24 @@ public class ReanalyzeService {
    private final RedactionLogCreatorService redactionLogCreatorService;
    private final RedactionStorageService redactionStorageService;
    private final PdfSegmentationService pdfSegmentationService;
-    private final ImageClassificationService imageClassificationService;
    private final RedactionChangeLogService redactionChangeLogService;
    private final AnalyzeResponseService analyzeResponseService;

    public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
-        var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
+

        var pageCount = 0;
        Document classifiedDoc;
-        try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
-            pdDocument.setAllSecurityToBeRemoved(true);
-            pdDocument.setResourceCache(null);

-            pageCount = pdDocument.getNumberOfPages();
-            classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
+        try {
+            var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
+            classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
+            pageCount = classifiedDoc.getPages().size();
        } catch (Exception e) {
            throw new RedactionException(e);
        }
        log.info("Document structure analysis successful, starting redaction analysis...");

-        imageClassificationService.classifyImages(classifiedDoc);
        entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
        redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
                .getRuleSetId());
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
@ -1,5 +1,6 @@
 package com.iqser.red.service.redaction.v1.server.segmentation;

+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.iqser.red.service.redaction.v1.model.SectionArea;
 import com.iqser.red.service.redaction.v1.server.classification.model.Document;
 import com.iqser.red.service.redaction.v1.server.classification.model.Page;
@ -8,11 +9,12 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
 import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
 import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
 import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
+import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
 import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
 import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
 import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
 import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
-import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
+import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
@ -20,13 +22,18 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingC
 import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.io.MemoryUsageSetting;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.springframework.stereotype.Service;

 import java.awt.geom.Rectangle2D;
+import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@ -37,14 +44,15 @@ import java.util.Map;
@RequiredArgsConstructor
 public class PdfSegmentationService {

+    private final static int MAX_PAGES_BEFORE_GC = 200;
+
    private final RulingCleaningService rulingCleaningService;
    private final TableExtractionService tableExtractionService;
    private final BlockificationService blockificationService;
    private final ClassificationService classificationService;
    private final SectionsBuilderService sectionsBuilderService;
-
-
-    private final RedactionStorageService redactionStorageService;
+    private final ImageClassificationService imageClassificationService;
+    private final ObjectMapper objectMapper;


    private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
@ -120,13 +128,40 @@ public class PdfSegmentationService {
    }


-    public Document parseDocument(PDDocument pdDocument) throws IOException {
+    /**
+     * Segments a PDF read from the given stream into a {@link Document}.
+     * <p>
+     * The stream is first spooled to a temporary file so that the PDF can be re-opened from disk
+     * every {@code MAX_PAGES_BEFORE_GC} pages; this is intended to limit the memory held by PDFBox.
+     *
+     * @param documentInputStream the PDF content; it is read to the end but not closed here
+     * @return the parsed document
+     * @throws IOException if the temporary copy cannot be written or the PDF cannot be loaded
+     */
+    public Document parseDocument(InputStream documentInputStream) throws IOException {

+        //create tempFile
+        File tempFile = File.createTempFile("document", ".pdf");
+        // close the file handle as soon as the copy is done instead of leaving it to the GC
+        try (FileOutputStream tempFileOutput = new FileOutputStream(tempFile)) {
+            IOUtils.copy(documentInputStream, tempFileOutput);
+        }
+
+        // initialize required variables
        Document document = new Document();
-
        List<Page> pages = new ArrayList<>();
        PDFLinesTextStripper stripper = new PDFLinesTextStripper();
-        for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
+
+        PDDocument pdDocument = reinitializePDDocument(tempFile, null);
+        long pageCount = pdDocument.getNumberOfPages();
+
+        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
+
+            if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
+                pdDocument = reinitializePDDocument(tempFile, pdDocument);
+            }
+
+
            PDPage pdPage = pdDocument.getPage(pageNumber - 1);
            stripper.setPageNumber(pageNumber);
            stripper.setStartPage(pageNumber);
@ -157,6 +179,9 @@ public class PdfSegmentationService {
            increaseDocumentStatistics(page, document);

            page.setImages(stripper.getImages());
+
+            imageClassificationService.classifyImages(page);
+
            pages.add(page);
        }

@ -166,12 +191,50 @@ public class PdfSegmentationService {
        sectionsBuilderService.buildSections(document);
        sectionsBuilderService.addImagesToSections(document);

+        pdDocument = reinitializePDDocument(tempFile, pdDocument);
+
        // This can be improved an done in one pass, but it's complicated to do right away
-        postProcessSections(pdDocument, document.getSectionText());
+        try {
+            postProcessSections(pdDocument, document.getSectionText());
+        } finally {
+            // the last re-opened PDDocument was never closed; release it and its temp file
+            // even when post-processing throws
+            try {
+                pdDocument.close();
+            } finally {
+                tempFile.delete();
+            }
+        }

        return document;
    }

+    /**
+     * Closes the given document, if any, asks the JVM to run finalizers and a garbage collection,
+     * logs the memory statistics and loads the PDF again from the temp file.
+     * <p>
+     * The new document buffers in temp files only and has its resource cache disabled.
+     *
+     * @param tempFile   the PDF on disk to (re-)open
+     * @param pdDocument the currently open document to close first, or {@code null} if there is none
+     * @return a newly loaded document; the caller is responsible for closing it
+     * @throws IOException if closing the old document or loading the new one fails
+     */
+    private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
+        if (pdDocument != null) {
+            pdDocument.close();
+        }
+        System.runFinalization();
+        System.gc();
+
+        MemoryStats.printMemoryStats();
+
+        var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly());
+        newPDDocument.setResourceCache(null);
+
+        return newPDDocument;
+    }
+

    private void increaseDocumentStatistics(Page page, Document document) {

@ -203,4 +247,5 @@ public class PdfSegmentationService {

    }

+
 }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@ -10,6 +10,7 @@ import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
 import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
 import com.iqser.red.service.redaction.v1.server.client.RulesClient;
 import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
+import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
 import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
 import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
 import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
@ -17,6 +18,7 @@ import com.iqser.red.storage.commons.service.StorageService;
 import lombok.SneakyThrows;
 import org.apache.commons.io.IOUtils;
 import org.junit.Before;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.kie.api.KieServices;
@ -440,6 +442,16 @@ public class RedactionIntegrationTest {
    }


+    @Test
+    @Ignore
+    public void testLargeScannedFileOOM(){
+        AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
+        MemoryStats.printMemoryStats();
+        AnalyzeResult result = redactionController.analyze(request);
+        assertThat(result).isNotNull();
+    }
+
+
    @Test
    public void noExceptionShouldBeThrownForAnyFiles() throws IOException {

@ -509,7 +521,6 @@ public class RedactionIntegrationTest {
    @Test
    public void redactionTest() throws IOException {

-        System.out.println("redactionTest");
        long start = System.currentTimeMillis();
        ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
        AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java
@ -11,7 +11,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
 import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
 import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
 import com.iqser.red.storage.commons.service.StorageService;
-import org.apache.pdfbox.pdmodel.PDDocument;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
@ -140,12 +139,10 @@ public class EntityRedactionServiceTest {
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
    }


@ -168,12 +165,10 @@ public class EntityRedactionServiceTest {
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
    }


@ -195,24 +190,20 @@ public class EntityRedactionServiceTest {
                .entries(Collections.emptyList())
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()
-                    .entrySet()
-                    .stream()
-                    .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()
+                .entrySet()
+                .stream()
+                .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
        pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
                "the plant protection product.pdf");
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()
-                    .entrySet()
-                    .stream()
-                    .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
-        }
+        classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()
+                .entrySet()
+                .stream()
+                .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
    }

    @Test
@ -232,14 +223,12 @@ public class EntityRedactionServiceTest {
                .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1).stream()
-                    .filter(entity -> entity.getMatchedRule() == 9)
-                    .count()).isEqualTo(10);
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1).stream()
+                .filter(entity -> entity.getMatchedRule() == 9)
+                .count()).isEqualTo(10);

    }

@ -301,14 +290,12 @@ public class EntityRedactionServiceTest {
                .entries(Collections.emptyList())
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1).stream()
-                    .filter(entity -> entity.getMatchedRule() == 6)
-                    .count()).isEqualTo(13);
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1).stream()
+                .filter(entity -> entity.getMatchedRule() == 6)
+                .count()).isEqualTo(13);

    }

@ -342,14 +329,12 @@ public class EntityRedactionServiceTest {
                .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1).stream()
-                    .filter(entity -> entity.getMatchedRule() == 11)
-                    .count()).isEqualTo(1);
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1).stream()
+                .filter(entity -> entity.getMatchedRule() == 11)
+                .count()).isEqualTo(1);

    }

@ -374,13 +359,11 @@ public class EntityRedactionServiceTest {
                .entries(Collections.emptyList())
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
-            assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
-            assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
+        assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
+        assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y

        pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");

@ -395,13 +378,11 @@ public class EntityRedactionServiceTest {
                .build();
        when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
-            assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
-        }
+        classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
+        assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
    }


@ -426,12 +407,10 @@ public class EntityRedactionServiceTest {
                .entries(Collections.emptyList())
                .build();
        when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
-            entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
-            assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
-            assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
-        }
+        Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
+        assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
+        assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
    }


@ -510,7 +489,7 @@ public class EntityRedactionServiceTest {
        }
    }

-    private List<DictionaryEntry> toDictionaryEntry(List<String> entries){
+    private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
        List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
        entries.forEach(entry -> {
            dictionaryEntries.add(new DictionaryEntry(entry, 1L, false));
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
@ -9,7 +9,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
 import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
 import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
-import org.apache.pdfbox.pdmodel.PDDocument;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@ -58,19 +57,17 @@ public class PdfSegmentationServiceTest {

        ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document document = pdfSegmentationService.parseDocument(pdDocument);
-            int i = 0;
-            for (Page page : document.getPages()) {
-                for (PdfImage image : page.getImages()) {
-                    try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
-                        ImageIO.write(image.getImage(), "png", baos);
-                        try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
-                            fileOutputStream.write(baos.toByteArray());
-                        }
+        Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        int i = 0;
+        for (Page page : document.getPages()) {
+            for (PdfImage image : page.getImages()) {
+                try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+                    ImageIO.write(image.getImage(), "png", baos);
+                    try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
+                        fileOutputStream.write(baos.toByteArray());
                    }
-                    i++;
                }
+                i++;
            }
        }
    }
@ -81,21 +78,19 @@ public class PdfSegmentationServiceTest {

        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document document = pdfSegmentationService.parseDocument(pdDocument);
-            assertThat(document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())).isNotEmpty();
-            Table table = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(0);
-            assertThat(table.getColCount()).isEqualTo(6);
-            assertThat(table.getRowCount()).isEqualTo(13);
-            assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
-        }
+        Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        assertThat(document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())).isNotEmpty();
+        Table table = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(0);
+        assertThat(table.getColCount()).isEqualTo(6);
+        assertThat(table.getRowCount()).isEqualTo(13);
+        assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
    }


@ -104,38 +99,36 @@ public class PdfSegmentationServiceTest {

        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document document = pdfSegmentationService.parseDocument(pdDocument);
-            assertThat(document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())).isNotEmpty();
-            Table firstTable = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(0);
-            assertThat(firstTable.getColCount()).isEqualTo(8);
-            assertThat(firstTable.getRowCount()).isEqualTo(1);
-            Table secondTable = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(1);
-            assertThat(secondTable.getColCount()).isEqualTo(8);
-            assertThat(secondTable.getRowCount()).isEqualTo(2);
-            List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
-                    .get(0)
-                    .stream()
-                    .map(Collections::singletonList)
-                    .collect(Collectors.toList());
-            assertThat(secondTable.getRows()
-                    .stream()
-                    .allMatch(row -> row.stream()
-                            .map(Cell::getHeaderCells)
-                            .collect(Collectors.toList())
-                            .equals(firstTableHeaderCells))).isTrue();
-        }
+        Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        assertThat(document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())).isNotEmpty();
+        Table firstTable = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(0);
+        assertThat(firstTable.getColCount()).isEqualTo(8);
+        assertThat(firstTable.getRowCount()).isEqualTo(1);
+        Table secondTable = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(1);
+        assertThat(secondTable.getColCount()).isEqualTo(8);
+        assertThat(secondTable.getRowCount()).isEqualTo(2);
+        List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
+                .get(0)
+                .stream()
+                .map(Collections::singletonList)
+                .collect(Collectors.toList());
+        assertThat(secondTable.getRows()
+                .stream()
+                .allMatch(row -> row.stream()
+                        .map(Cell::getHeaderCells)
+                        .collect(Collectors.toList())
+                        .equals(firstTableHeaderCells))).isTrue();
    }


@ -144,38 +137,36 @@ public class PdfSegmentationServiceTest {

        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document document = pdfSegmentationService.parseDocument(pdDocument);
-            assertThat(document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())).isNotEmpty();
-            Table firstTable = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(0);
-            assertThat(firstTable.getColCount()).isEqualTo(9);
-            assertThat(firstTable.getRowCount()).isEqualTo(5);
-            Table secondTable = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(1);
-            assertThat(secondTable.getColCount()).isEqualTo(9);
-            assertThat(secondTable.getRowCount()).isEqualTo(6);
-            List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
-                    .get(firstTable.getRowCount() - 1)
-                    .stream()
-                    .map(Cell::getHeaderCells)
-                    .collect(Collectors.toList());
-            assertThat(secondTable.getRows()
-                    .stream()
-                    .allMatch(row -> row.stream()
-                            .map(Cell::getHeaderCells)
-                            .collect(Collectors.toList())
-                            .equals(firstTableHeaderCells))).isTrue();
-        }
+        Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        assertThat(document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())).isNotEmpty();
+        Table firstTable = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(0);
+        assertThat(firstTable.getColCount()).isEqualTo(9);
+        assertThat(firstTable.getRowCount()).isEqualTo(5);
+        Table secondTable = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(1);
+        assertThat(secondTable.getColCount()).isEqualTo(9);
+        assertThat(secondTable.getRowCount()).isEqualTo(6);
+        List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
+                .get(firstTable.getRowCount() - 1)
+                .stream()
+                .map(Cell::getHeaderCells)
+                .collect(Collectors.toList());
+        assertThat(secondTable.getRows()
+                .stream()
+                .allMatch(row -> row.stream()
+                        .map(Cell::getHeaderCells)
+                        .collect(Collectors.toList())
+                        .equals(firstTableHeaderCells))).isTrue();
    }


@ -184,38 +175,36 @@ public class PdfSegmentationServiceTest {

        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");

-        try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
-            Document document = pdfSegmentationService.parseDocument(pdDocument);
-            assertThat(document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())).isNotEmpty();
-            Table firstTable = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(0);
-            assertThat(firstTable.getColCount()).isEqualTo(8);
-            assertThat(firstTable.getRowCount()).isEqualTo(1);
-            Table secondTable = document.getParagraphs()
-                    .stream()
-                    .flatMap(paragraph -> paragraph.getTables().stream())
-                    .collect(Collectors.toList())
-                    .get(1);
-            assertThat(secondTable.getColCount()).isEqualTo(8);
-            assertThat(secondTable.getRowCount()).isEqualTo(6);
-            List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
-                    .get(0)
-                    .stream()
-                    .map(Collections::singletonList)
-                    .collect(Collectors.toList());
-            assertThat(secondTable.getRows()
-                    .stream()
-                    .allMatch(row -> row.stream()
-                            .map(Cell::getHeaderCells)
-                            .collect(Collectors.toList())
-                            .equals(firstTableHeaderCells))).isTrue();
-        }
+        Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        assertThat(document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())).isNotEmpty();
+        Table firstTable = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(0);
+        assertThat(firstTable.getColCount()).isEqualTo(8);
+        assertThat(firstTable.getRowCount()).isEqualTo(1);
+        Table secondTable = document.getParagraphs()
+                .stream()
+                .flatMap(paragraph -> paragraph.getTables().stream())
+                .collect(Collectors.toList())
+                .get(1);
+        assertThat(secondTable.getColCount()).isEqualTo(8);
+        assertThat(secondTable.getRowCount()).isEqualTo(6);
+        List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
+                .get(0)
+                .stream()
+                .map(Collections::singletonList)
+                .collect(Collectors.toList());
+        assertThat(secondTable.getRows()
+                .stream()
+                .allMatch(row -> row.stream()
+                        .map(Cell::getHeaderCells)
+                        .collect(Collectors.toList())
+                        .equals(firstTableHeaderCells))).isTrue();
    }

 }
--- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf