diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java index d3933b0c..58e2d815 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/Application.java @@ -18,6 +18,7 @@ import org.springframework.context.annotation.Import; public class Application { public static void main(String[] args) { + System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true"); SpringApplication.run(Application.class, args); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index 496f28f3..63d27c59 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -79,50 +79,61 @@ public class RedactionController implements RedactionResource { @Override public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) { var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try { + Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); - try (PDDocument pdDocument = PDDocument.load(storedObjectStream, 
MemoryUsageSetting.setupTempFileOnly())) { - pdDocument.setAllSecurityToBeRemoved(true); - pdDocument.setResourceCache(null); + storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { + pdDocument.setAllSecurityToBeRemoved(true); - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument); + pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument); - return convert(pdDocument, classifiedDoc.getPages().size()); + return convert(pdDocument, classifiedDoc.getPages().size()); + + } catch (IOException e) { + throw new RedactionException(e); + } } catch (IOException e) { throw new RedactionException(e); } + } @Override public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) { var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try { + Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); - try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { - pdDocument.setAllSecurityToBeRemoved(true); + storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { + pdDocument.setAllSecurityToBeRemoved(true); - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument); + pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument); + return 
convert(pdDocument, classifiedDoc.getPages().size()); - return convert(pdDocument, classifiedDoc.getPages().size()); + } catch (IOException e) { + throw new RedactionException(e); + } } catch (IOException e) { throw new RedactionException(e); } + } @Override public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) { - var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); - Document classifiedDoc; - try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) { - pdDocument.setAllSecurityToBeRemoved(true); - classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + + try { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN)); + classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); } catch (Exception e) { throw new RedactionException(e); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java new file mode 100644 index 00000000..2491d9eb --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/memory/MemoryStats.java @@ -0,0 +1,52 @@ +package com.iqser.red.service.redaction.v1.server.memory; + +import lombok.extern.slf4j.Slf4j; + +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; + +@Slf4j +public class MemoryStats { + + + public static void printMemoryStats() { + log.info("\n\n ------------------------------ \n" + + " Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" + + " Free Memory: " + 
humanReadableByteCountBin(getFreeMemory()) + "\n" + + " Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" + + " Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" + + "\n ------------------------------ \n"); + } + + + public static String humanReadableByteCountBin(long bytes) { + long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes); + if (absB < 1024) { + return bytes + " B"; + } + long value = absB; + CharacterIterator ci = new StringCharacterIterator("KMGTPE"); + for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) { + value >>= 10; + ci.next(); + } + value *= Long.signum(bytes); + return String.format("%.1f %ciB", value / 1024.0, ci.current()); + } + + private static long getMaxMemory() { + return Runtime.getRuntime().maxMemory(); + } + + private static long getUsedMemory() { + return getMaxMemory() - getFreeMemory(); + } + + private static long getTotalMemory() { + return Runtime.getRuntime().totalMemory(); + } + + private static long getFreeMemory() { + return Runtime.getRuntime().freeMemory(); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index 26933528..aa69cbbc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.pdfbox.contentstream.operator.Operator; import 
org.apache.pdfbox.contentstream.operator.OperatorName; import org.apache.pdfbox.contentstream.operator.color.*; @@ -195,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper { Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds .getWidth(), (float) imageBounds.getHeight()); + // Memory Hack - SoftReference kills me + FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true); + if (rect.getHeight() > 2 && rect.getWidth() > 2) { this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber)); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java index 4e60f6e4..f7f6ad4f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java @@ -1,8 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; -import lombok.AllArgsConstructor; import lombok.Data; -import lombok.NoArgsConstructor; import lombok.NonNull; import lombok.RequiredArgsConstructor; @@ -10,12 +8,9 @@ import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; @Data -@NoArgsConstructor -@AllArgsConstructor @RequiredArgsConstructor public class PdfImage { - @NonNull private BufferedImage image; @NonNull private Rectangle2D position; @@ -25,4 +20,10 @@ public class PdfImage { @NonNull private int page; + public PdfImage(BufferedImage image, Rectangle2D position, int page) { + this.image = image; + this.position = position; + this.page = page; + } + } diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java index 6682a23d..44012156 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java @@ -1,6 +1,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse; import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile; @@ -23,37 +23,40 @@ public class ImageClassificationService { private final RedactionServiceSettings settings; - public void classifyImages(Document classifiedDoc) { + public void classifyImages(Page page) { - long start = System.currentTimeMillis(); - classifiedDoc.getPages().forEach(page -> { - page.getImages().forEach(image -> { + page.getImages().forEach(image -> { - if (settings.isEnableImageClassification()) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - ImageIO.write(image.getImage(), "png", baos); - ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos - .toByteArray())); - image.setImageType(ImageType.valueOf(response.getCategory())); + if (settings.isEnableImageClassification()) { - } catch (IOException e) { - 
log.error("Could not classify image", e); + long start = System.currentTimeMillis(); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + ImageIO.write(image.getImage(), "png", baos); + var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray()); + ImageClassificationResponse response = imageClassificationClient.classify(mockFile); + image.setImageType(ImageType.valueOf(response.getCategory())); + } catch (IOException e) { + log.error("Could not classify image", e); + } + + log.info("Image classification took: " + (System.currentTimeMillis() - start)); + } else { + image.setImageType(ImageType.OTHER); + } + + image.getImage().flush(); + image.setImage(null); + + if (image.getImageType().equals(ImageType.OTHER)) { + page.getTextBlocks().forEach(textblock -> { + if (image.getPosition() + .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { + image.setImageType(ImageType.OCR); } - } else { - image.setImageType(ImageType.OTHER); - } - - if (image.getImageType().equals(ImageType.OTHER)) { - page.getTextBlocks().forEach(textblock -> { - if (image.getPosition() - .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { - image.setImageType(ImageType.OCR); - } - }); - } - }); + }); + } }); - log.info("Image classification took: " + (System.currentTimeMillis() - start)); + } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 152dd84d..2ebc57f4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -13,8 +13,6 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.pdmodel.PDDocument; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; @@ -36,27 +34,24 @@ public class ReanalyzeService { private final RedactionLogCreatorService redactionLogCreatorService; private final RedactionStorageService redactionStorageService; private final PdfSegmentationService pdfSegmentationService; - private final ImageClassificationService imageClassificationService; private final RedactionChangeLogService redactionChangeLogService; private final AnalyzeResponseService analyzeResponseService; public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) { - var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); + var pageCount = 0; Document classifiedDoc; - try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) { - pdDocument.setAllSecurityToBeRemoved(true); - pdDocument.setResourceCache(null); - pageCount = pdDocument.getNumberOfPages(); - classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + try { + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); + classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream); + pageCount = 
classifiedDoc.getPages().size(); } catch (Exception e) { throw new RedactionException(e); } log.info("Document structure analysis successful, starting redaction analysis..."); - imageClassificationService.classifyImages(classifiedDoc); entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest .getRuleSetId()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 77e088a5..d4633127 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -1,5 +1,6 @@ package com.iqser.red.service.redaction.v1.server.segmentation; +import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Page; @@ -8,11 +9,12 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; import com.iqser.red.service.redaction.v1.server.exception.RedactionException; +import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; import 
com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; -import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; @@ -20,13 +22,18 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingC import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; import java.awt.geom.Rectangle2D; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -37,14 +44,15 @@ import java.util.Map; @RequiredArgsConstructor public class PdfSegmentationService { + private final static int MAX_PAGES_BEFORE_GC = 200; + private final RulingCleaningService rulingCleaningService; private final TableExtractionService tableExtractionService; private final BlockificationService blockificationService; private final ClassificationService classificationService; private final SectionsBuilderService sectionsBuilderService; - - - private final 
RedactionStorageService redactionStorageService; + private final ImageClassificationService imageClassificationService; + private final ObjectMapper objectMapper; private void postProcessSections(PDDocument pdDocument, List texts) { @@ -120,13 +128,27 @@ public class PdfSegmentationService { } - public Document parseDocument(PDDocument pdDocument) throws IOException { + public Document parseDocument(InputStream documentInputStream) throws IOException { + //create tempFile + File tempFile = File.createTempFile("document", ".pdf"); + IOUtils.copy(documentInputStream, new FileOutputStream(tempFile)); + + // initialize required variables Document document = new Document(); - List pages = new ArrayList<>(); PDFLinesTextStripper stripper = new PDFLinesTextStripper(); - for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) { + + PDDocument pdDocument = reinitializePDDocument(tempFile, null); + long pageCount = pdDocument.getNumberOfPages(); + + for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { + + if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { + pdDocument = reinitializePDDocument(tempFile, pdDocument); + } + + PDPage pdPage = pdDocument.getPage(pageNumber - 1); stripper.setPageNumber(pageNumber); stripper.setStartPage(pageNumber); @@ -157,6 +179,9 @@ public class PdfSegmentationService { increaseDocumentStatistics(page, document); page.setImages(stripper.getImages()); + + imageClassificationService.classifyImages(page); + pages.add(page); } @@ -166,12 +191,31 @@ public class PdfSegmentationService { sectionsBuilderService.buildSections(document); sectionsBuilderService.addImagesToSections(document); + pdDocument = reinitializePDDocument(tempFile, pdDocument); + // This can be improved an done in one pass, but it's complicated to do right away postProcessSections(pdDocument, document.getSectionText()); + tempFile.delete(); + return document; } + private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) 
throws IOException { + if (pdDocument != null) { + pdDocument.close(); + } + System.runFinalization(); + System.gc(); + + MemoryStats.printMemoryStats(); + + var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly()); + newPDDocument.setResourceCache(null); + + return newPDDocument; + } + private void increaseDocumentStatistics(Page page, Document document) { @@ -203,4 +247,5 @@ public class PdfSegmentationService { } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 75384b34..3e4dc0eb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -10,6 +10,7 @@ import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.controller.RedactionController; +import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; @@ -17,6 +18,7 @@ import com.iqser.red.storage.commons.service.StorageService; import lombok.SneakyThrows; import org.apache.commons.io.IOUtils; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.KieServices; @@ -440,6 
+442,16 @@ public class RedactionIntegrationTest { } + @Test + @Ignore + public void testLargeScannedFileOOM(){ + AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf"); + MemoryStats.printMemoryStats(); + AnalyzeResult result = redactionController.analyze(request); + assertThat(result).isNotNull(); + } + + @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { @@ -509,7 +521,6 @@ public class RedactionIntegrationTest { @Test public void redactionTest() throws IOException { - System.out.println("redactionTest"); long start = System.currentTimeMillis(); ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 95c8ebf8..25213df4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -11,7 +11,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.storage.commons.service.StorageService; -import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; @@ -140,12 +139,10 @@ public class EntityRedactionServiceTest { .build(); 
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities } @@ -168,12 +165,10 @@ public class EntityRedactionServiceTest { .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities } @@ -195,24 +190,20 @@ public class EntityRedactionServiceTest { .entries(Collections.emptyList()) .build(); 
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities() - .entrySet() - .stream() - .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " + "the plant protection product.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities() - .entrySet() - .stream() - .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); - } + classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities() + .entrySet() + .stream() + .noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue(); } @Test @@ -232,14 +223,12 @@ public class EntityRedactionServiceTest { .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, 
TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream() - .filter(entity -> entity.getMatchedRule() == 9) - .count()).isEqualTo(10); - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 9) + .count()).isEqualTo(10); } @@ -301,14 +290,12 @@ public class EntityRedactionServiceTest { .entries(Collections.emptyList()) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream() - .filter(entity -> entity.getMatchedRule() == 6) - .count()).isEqualTo(13); - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 6) + .count()).isEqualTo(13); } @@ -342,14 +329,12 @@ public class EntityRedactionServiceTest { 
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream() - .filter(entity -> entity.getMatchedRule() == 11) - .count()).isEqualTo(1); - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream() + .filter(entity -> entity.getMatchedRule() == 11) + .count()).isEqualTo(1); } @@ -374,13 +359,11 @@ public class EntityRedactionServiceTest { .entries(Collections.emptyList()) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8); - assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + 
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8); + assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf"); @@ -395,13 +378,11 @@ public class EntityRedactionServiceTest { .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3); - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9); - } + classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3); + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9); } @@ -426,12 +407,10 @@ public class EntityRedactionServiceTest { .entries(Collections.emptyList()) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); - try (PDDocument pdDocument = 
PDDocument.load(pdfFileResource.getInputStream())) { - Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); - entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); - assertThat(classifiedDoc.getEntities()).hasSize(1); // one page - assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6); - } + Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6); } @@ -510,7 +489,7 @@ public class EntityRedactionServiceTest { } } - private List toDictionaryEntry(List entries){ + private List toDictionaryEntry(List entries) { List dictionaryEntries = new ArrayList<>(); entries.forEach(entry -> { dictionaryEntries.add(new DictionaryEntry(entry, 1L, false)); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index 4d83412a..c7bb136f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -9,7 +9,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import 
com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; -import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; @@ -58,19 +57,17 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - int i = 0; - for (Page page : document.getPages()) { - for (PdfImage image : page.getImages()) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - ImageIO.write(image.getImage(), "png", baos); - try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) { - fileOutputStream.write(baos.toByteArray()); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + int i = 0; + for (Page page : document.getPages()) { + for (PdfImage image : page.getImages()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + ImageIO.write(image.getImage(), "png", baos); + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) { + fileOutputStream.write(baos.toByteArray()); } - i++; } + i++; } } } @@ -81,21 +78,19 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table table = document.getParagraphs() - 
.stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(table.getColCount()).isEqualTo(6); - assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table table = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(table.getColCount()).isEqualTo(6); + assertThat(table.getRowCount()).isEqualTo(13); + assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); } @@ -104,38 +99,36 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table firstTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(firstTable.getColCount()).isEqualTo(8); - assertThat(firstTable.getRowCount()).isEqualTo(1); - Table secondTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(1); - assertThat(secondTable.getColCount()).isEqualTo(8); - assertThat(secondTable.getRowCount()).isEqualTo(2); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - 
.collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()) - .equals(firstTableHeaderCells))).isTrue(); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(8); + assertThat(firstTable.getRowCount()).isEqualTo(1); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(8); + assertThat(secondTable.getRowCount()).isEqualTo(2); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))).isTrue(); } @@ -144,38 +137,36 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table firstTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - 
assertThat(firstTable.getColCount()).isEqualTo(9); - assertThat(firstTable.getRowCount()).isEqualTo(5); - Table secondTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(1); - assertThat(secondTable.getColCount()).isEqualTo(9); - assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(firstTable.getRowCount() - 1) - .stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()) - .equals(firstTableHeaderCells))).isTrue(); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(9); + assertThat(firstTable.getRowCount()).isEqualTo(5); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(9); + assertThat(secondTable.getRowCount()).isEqualTo(6); + List> firstTableHeaderCells = firstTable.getRows() + .get(firstTable.getRowCount() - 1) + .stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))).isTrue(); } @@ -184,38 +175,36 @@ public class PdfSegmentationServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal 
Examples/Rotated Table Headers.pdf"); - try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { - Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table firstTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(firstTable.getColCount()).isEqualTo(8); - assertThat(firstTable.getRowCount()).isEqualTo(1); - Table secondTable = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(1); - assertThat(secondTable.getColCount()).isEqualTo(8); - assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()) - .equals(firstTableHeaderCells))).isTrue(); - } + Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream()); + assertThat(document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList())).isNotEmpty(); + Table firstTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(0); + assertThat(firstTable.getColCount()).isEqualTo(8); + assertThat(firstTable.getRowCount()).isEqualTo(1); + Table secondTable = document.getParagraphs() + .stream() + .flatMap(paragraph -> paragraph.getTables().stream()) + .collect(Collectors.toList()) + .get(1); + assertThat(secondTable.getColCount()).isEqualTo(8); + assertThat(secondTable.getRowCount()).isEqualTo(6); + List> 
firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()) + .equals(firstTableHeaderCells))).isTrue(); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf new file mode 100644 index 00000000..0fe661c4 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/scanned/VV-377031.pdf differ