"fixed" memory issues by calling GC manually, removing soft reference cache and disposing images properly
This commit is contained in:
parent
4749858e80
commit
8060e3a29f
@ -18,6 +18,7 @@ import org.springframework.context.annotation.Import;
|
||||
public class Application {
|
||||
|
||||
public static void main(String[] args) {
|
||||
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
|
||||
|
||||
@ -79,50 +79,61 @@ public class RedactionController implements RedactionResource {
|
||||
@Override
|
||||
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
pdDocument.setResourceCache(null);
|
||||
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
|
||||
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
|
||||
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
|
||||
return convert(pdDocument, classifiedDoc.getPages().size());
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {
|
||||
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
Document classifiedDoc;
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
|
||||
try {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
|
||||
@ -0,0 +1,52 @@
|
||||
package com.iqser.red.service.redaction.v1.server.memory;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.StringCharacterIterator;
|
||||
|
||||
@Slf4j
|
||||
public class MemoryStats {
|
||||
|
||||
|
||||
public static void printMemoryStats() {
|
||||
log.info("\n\n ------------------------------ \n" +
|
||||
" Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" +
|
||||
" Free Memory: " + humanReadableByteCountBin(getFreeMemory()) + "\n" +
|
||||
" Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" +
|
||||
" Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" +
|
||||
"\n ------------------------------ \n");
|
||||
}
|
||||
|
||||
|
||||
public static String humanReadableByteCountBin(long bytes) {
|
||||
long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes);
|
||||
if (absB < 1024) {
|
||||
return bytes + " B";
|
||||
}
|
||||
long value = absB;
|
||||
CharacterIterator ci = new StringCharacterIterator("KMGTPE");
|
||||
for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) {
|
||||
value >>= 10;
|
||||
ci.next();
|
||||
}
|
||||
value *= Long.signum(bytes);
|
||||
return String.format("%.1f %ciB", value / 1024.0, ci.current());
|
||||
}
|
||||
|
||||
private static long getMaxMemory() {
|
||||
return Runtime.getRuntime().maxMemory();
|
||||
}
|
||||
|
||||
private static long getUsedMemory() {
|
||||
return getMaxMemory() - getFreeMemory();
|
||||
}
|
||||
|
||||
private static long getTotalMemory() {
|
||||
return Runtime.getRuntime().totalMemory();
|
||||
}
|
||||
|
||||
private static long getFreeMemory() {
|
||||
return Runtime.getRuntime().freeMemory();
|
||||
}
|
||||
}
|
||||
@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.reflect.FieldUtils;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
||||
@ -195,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
|
||||
.getWidth(), (float) imageBounds.getHeight());
|
||||
|
||||
// Memory hack - the SoftReference image cache kills me
|
||||
FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true);
|
||||
|
||||
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
|
||||
this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber));
|
||||
}
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -10,12 +8,9 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
public class PdfImage {
|
||||
|
||||
@NonNull
|
||||
private BufferedImage image;
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
@ -25,4 +20,10 @@ public class PdfImage {
|
||||
@NonNull
|
||||
private int page;
|
||||
|
||||
/**
 * Creates an image entry for a page.
 *
 * @param image    the image content
 * @param position the rectangle the image occupies
 * @param page     the number of the page the image belongs to
 */
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
    // Plain field initialisation; no validation is performed here.
    this.page = page;
    this.position = position;
    this.image = image;
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
|
||||
@ -23,37 +23,40 @@ public class ImageClassificationService {
|
||||
private final RedactionServiceSettings settings;
|
||||
|
||||
|
||||
public void classifyImages(Document classifiedDoc) {
|
||||
public void classifyImages(Page page) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
classifiedDoc.getPages().forEach(page -> {
|
||||
page.getImages().forEach(image -> {
|
||||
page.getImages().forEach(image -> {
|
||||
|
||||
if (settings.isEnableImageClassification()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
|
||||
.toByteArray()));
|
||||
image.setImageType(ImageType.valueOf(response.getCategory()));
|
||||
if (settings.isEnableImageClassification()) {
|
||||
|
||||
} catch (IOException e) {
|
||||
log.error("Could not classify image", e);
|
||||
long start = System.currentTimeMillis();
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
|
||||
image.setImageType(ImageType.valueOf(response.getCategory()));
|
||||
} catch (IOException e) {
|
||||
log.error("Could not classify image", e);
|
||||
}
|
||||
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
} else {
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
image.getImage().flush();
|
||||
image.setImage(null);
|
||||
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
page.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition()
|
||||
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
}
|
||||
} else {
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
page.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition()
|
||||
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -13,8 +13,6 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
@ -36,27 +34,24 @@ public class ReanalyzeService {
|
||||
private final RedactionLogCreatorService redactionLogCreatorService;
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
private final PdfSegmentationService pdfSegmentationService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
private final RedactionChangeLogService redactionChangeLogService;
|
||||
private final AnalyzeResponseService analyzeResponseService;
|
||||
|
||||
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
|
||||
var pageCount = 0;
|
||||
Document classifiedDoc;
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
pdDocument.setResourceCache(null);
|
||||
|
||||
pageCount = pdDocument.getNumberOfPages();
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
try {
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
|
||||
pageCount = classifiedDoc.getPages().size();
|
||||
} catch (Exception e) {
|
||||
throw new RedactionException(e);
|
||||
}
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
@ -8,11 +9,12 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
|
||||
@ -20,13 +22,18 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingC
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -37,14 +44,15 @@ import java.util.Map;
|
||||
@RequiredArgsConstructor
|
||||
public class PdfSegmentationService {
|
||||
|
||||
private final static int MAX_PAGES_BEFORE_GC = 200;
|
||||
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final BlockificationService blockificationService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
private final RedactionStorageService redactionStorageService;
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
|
||||
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
|
||||
@ -120,13 +128,27 @@ public class PdfSegmentationService {
|
||||
}
|
||||
|
||||
|
||||
public Document parseDocument(PDDocument pdDocument) throws IOException {
|
||||
public Document parseDocument(InputStream documentInputStream) throws IOException {
|
||||
|
||||
//create tempFile
|
||||
File tempFile = File.createTempFile("document", ".pdf");
|
||||
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
|
||||
|
||||
// initialize required variables
|
||||
Document document = new Document();
|
||||
|
||||
List<Page> pages = new ArrayList<>();
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
|
||||
PDDocument pdDocument = reinitializePDDocument(tempFile, null);
|
||||
long pageCount = pdDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
}
|
||||
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
@ -157,6 +179,9 @@ public class PdfSegmentationService {
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
page.setImages(stripper.getImages());
|
||||
|
||||
imageClassificationService.classifyImages(page);
|
||||
|
||||
pages.add(page);
|
||||
}
|
||||
|
||||
@ -166,12 +191,31 @@ public class PdfSegmentationService {
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
|
||||
// This can be improved and done in one pass, but it's complicated to do right away
|
||||
postProcessSections(pdDocument, document.getSectionText());
|
||||
|
||||
tempFile.delete();
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
|
||||
if (pdDocument != null) {
|
||||
pdDocument.close();
|
||||
}
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
|
||||
MemoryStats.printMemoryStats();
|
||||
|
||||
var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly());
|
||||
newPDDocument.setResourceCache(null);
|
||||
|
||||
return newPDDocument;
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(Page page, Document document) {
|
||||
|
||||
@ -203,4 +247,5 @@ public class PdfSegmentationService {
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -10,6 +10,7 @@ import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
@ -17,6 +18,7 @@ import com.iqser.red.storage.commons.service.StorageService;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.KieServices;
|
||||
@ -440,6 +442,16 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void testLargeScannedFileOOM(){
|
||||
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
|
||||
MemoryStats.printMemoryStats();
|
||||
AnalyzeResult result = redactionController.analyze(request);
|
||||
assertThat(result).isNotNull();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
|
||||
|
||||
@ -509,7 +521,6 @@ public class RedactionIntegrationTest {
|
||||
@Test
|
||||
public void redactionTest() throws IOException {
|
||||
|
||||
System.out.println("redactionTest");
|
||||
long start = System.currentTimeMillis();
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
|
||||
|
||||
@ -11,7 +11,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
@ -140,12 +139,10 @@ public class EntityRedactionServiceTest {
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
|
||||
|
||||
@ -168,12 +165,10 @@ public class EntityRedactionServiceTest {
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
|
||||
}
|
||||
|
||||
|
||||
@ -195,24 +190,20 @@ public class EntityRedactionServiceTest {
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
|
||||
"the plant protection product.pdf");
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
}
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -232,14 +223,12 @@ public class EntityRedactionServiceTest {
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 9)
|
||||
.count()).isEqualTo(10);
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 9)
|
||||
.count()).isEqualTo(10);
|
||||
|
||||
}
|
||||
|
||||
@ -301,14 +290,12 @@ public class EntityRedactionServiceTest {
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 6)
|
||||
.count()).isEqualTo(13);
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 6)
|
||||
.count()).isEqualTo(13);
|
||||
|
||||
}
|
||||
|
||||
@ -342,14 +329,12 @@ public class EntityRedactionServiceTest {
|
||||
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 11)
|
||||
.count()).isEqualTo(1);
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream()
|
||||
.filter(entity -> entity.getMatchedRule() == 11)
|
||||
.count()).isEqualTo(1);
|
||||
|
||||
}
|
||||
|
||||
@ -374,13 +359,11 @@ public class EntityRedactionServiceTest {
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
|
||||
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
|
||||
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
|
||||
|
||||
pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");
|
||||
|
||||
@ -395,13 +378,11 @@ public class EntityRedactionServiceTest {
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
|
||||
}
|
||||
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
|
||||
}
|
||||
|
||||
|
||||
@ -426,12 +407,10 @@ public class EntityRedactionServiceTest {
|
||||
.entries(Collections.emptyList())
|
||||
.build();
|
||||
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
|
||||
}
|
||||
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
|
||||
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
|
||||
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
|
||||
}
|
||||
|
||||
|
||||
@ -510,7 +489,7 @@ public class EntityRedactionServiceTest {
|
||||
}
|
||||
}
|
||||
|
||||
private List<DictionaryEntry> toDictionaryEntry(List<String> entries){
|
||||
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
|
||||
List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
|
||||
entries.forEach(entry -> {
|
||||
dictionaryEntries.add(new DictionaryEntry(entry, 1L, false));
|
||||
|
||||
@ -9,7 +9,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
@ -58,19 +57,17 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
int i = 0;
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
|
||||
fileOutputStream.write(baos.toByteArray());
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
int i = 0;
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
|
||||
fileOutputStream.write(baos.toByteArray());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -81,21 +78,19 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
|
||||
|
||||
@ -104,38 +99,36 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -144,38 +137,36 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -184,38 +175,36 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table firstTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
Table secondTable = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user