"fixed" memory issues by calling GC manually, removing soft reference cache and disposing images properly

This commit is contained in:
Timo 2021-04-16 23:13:12 +03:00
parent 4749858e80
commit 8060e3a29f
12 changed files with 354 additions and 263 deletions

View File

@ -18,6 +18,7 @@ import org.springframework.context.annotation.Import;
public class Application {
public static void main(String[] args) {
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
SpringApplication.run(Application.class, args);
}

View File

@ -79,50 +79,61 @@ public class RedactionController implements RedactionResource {
@Override
public RedactionResult classify(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
pdDocument.setResourceCache(null);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
pdfVisualisationService.visualizeClassifications(classifiedDoc, pdDocument);
return convert(pdDocument, classifiedDoc.getPages().size());
return convert(pdDocument, classifiedDoc.getPages().size());
} catch (IOException e) {
throw new RedactionException(e);
}
} catch (IOException e) {
throw new RedactionException(e);
}
}
@Override
public RedactionResult sections(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
pdfVisualisationService.visualizeParagraphs(classifiedDoc, pdDocument);
return convert(pdDocument, classifiedDoc.getPages().size());
return convert(pdDocument, classifiedDoc.getPages().size());
} catch (IOException e) {
throw new RedactionException(e);
}
} catch (IOException e) {
throw new RedactionException(e);
}
}
@Override
public RedactionResult htmlTables(@RequestBody RedactionRequest redactionRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
Document classifiedDoc;
try (PDDocument pdDocument = PDDocument.load(storedObjectStream)) {
pdDocument.setAllSecurityToBeRemoved(true);
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest.getProjectId(), redactionRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
} catch (Exception e) {
throw new RedactionException(e);
}

View File

@ -0,0 +1,52 @@
package com.iqser.red.service.redaction.v1.server.memory;
import lombok.extern.slf4j.Slf4j;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
@Slf4j
public class MemoryStats {
public static void printMemoryStats() {
log.info("\n\n ------------------------------ \n" +
" Used Memory: " + humanReadableByteCountBin(getUsedMemory()) + "\n" +
" Free Memory: " + humanReadableByteCountBin(getFreeMemory()) + "\n" +
" Total Memory: " + humanReadableByteCountBin(getTotalMemory()) + "\n" +
" Max Memory: " + humanReadableByteCountBin(getMaxMemory()) + "\n" +
"\n ------------------------------ \n");
}
public static String humanReadableByteCountBin(long bytes) {
long absB = bytes == Long.MIN_VALUE ? Long.MAX_VALUE : Math.abs(bytes);
if (absB < 1024) {
return bytes + " B";
}
long value = absB;
CharacterIterator ci = new StringCharacterIterator("KMGTPE");
for (int i = 40; i >= 0 && absB > 0xfffccccccccccccL >> i; i -= 10) {
value >>= 10;
ci.next();
}
value *= Long.signum(bytes);
return String.format("%.1f %ciB", value / 1024.0, ci.current());
}
private static long getMaxMemory() {
return Runtime.getRuntime().maxMemory();
}
private static long getUsedMemory() {
return getMaxMemory() - getFreeMemory();
}
private static long getTotalMemory() {
return Runtime.getRuntime().totalMemory();
}
private static long getFreeMemory() {
return Runtime.getRuntime().freeMemory();
}
}

View File

@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.*;
@ -195,6 +196,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
Rectangle2D rect = new Rectangle2D.Float((float) imageBounds.getX(), (float) imageBounds.getY(), (float) imageBounds
.getWidth(), (float) imageBounds.getHeight());
// Memory Hack - sofReference kills me
FieldUtils.writeField(pdfImage, "cachedImageSubsampling", -1, true);
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber));
}

View File

@ -1,8 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@ -10,12 +8,9 @@ import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
@Data
@NoArgsConstructor
@AllArgsConstructor
@RequiredArgsConstructor
public class PdfImage {
@NonNull
private BufferedImage image;
@NonNull
private Rectangle2D position;
@ -25,4 +20,10 @@ public class PdfImage {
@NonNull
private int page;
public PdfImage(BufferedImage image, Rectangle2D position, int page) {
this.image = image;
this.position = position;
this.page = page;
}
}

View File

@ -1,6 +1,6 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
@ -23,37 +23,40 @@ public class ImageClassificationService {
private final RedactionServiceSettings settings;
public void classifyImages(Document classifiedDoc) {
public void classifyImages(Page page) {
long start = System.currentTimeMillis();
classifiedDoc.getPages().forEach(page -> {
page.getImages().forEach(image -> {
page.getImages().forEach(image -> {
if (settings.isEnableImageClassification()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
.toByteArray()));
image.setImageType(ImageType.valueOf(response.getCategory()));
if (settings.isEnableImageClassification()) {
} catch (IOException e) {
log.error("Could not classify image", e);
long start = System.currentTimeMillis();
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
image.setImageType(ImageType.valueOf(response.getCategory()));
} catch (IOException e) {
log.error("Could not classify image", e);
}
log.info("Image classification took: " + (System.currentTimeMillis() - start));
} else {
image.setImageType(ImageType.OTHER);
}
image.getImage().flush();
image.setImage(null);
if (image.getImageType().equals(ImageType.OTHER)) {
page.getTextBlocks().forEach(textblock -> {
if (image.getPosition()
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
image.setImageType(ImageType.OCR);
}
} else {
image.setImageType(ImageType.OTHER);
}
if (image.getImageType().equals(ImageType.OTHER)) {
page.getTextBlocks().forEach(textblock -> {
if (image.getPosition()
.contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
image.setImageType(ImageType.OCR);
}
});
}
});
});
}
});
log.info("Image classification took: " + (System.currentTimeMillis() - start));
}
}

View File

@ -13,8 +13,6 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
@ -36,27 +34,24 @@ public class ReanalyzeService {
private final RedactionLogCreatorService redactionLogCreatorService;
private final RedactionStorageService redactionStorageService;
private final PdfSegmentationService pdfSegmentationService;
private final ImageClassificationService imageClassificationService;
private final RedactionChangeLogService redactionChangeLogService;
private final AnalyzeResponseService analyzeResponseService;
public AnalyzeResult analyze(AnalyzeRequest analyzeRequest) {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
var pageCount = 0;
Document classifiedDoc;
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
pdDocument.setResourceCache(null);
pageCount = pdDocument.getNumberOfPages();
classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
pageCount = classifiedDoc.getPages().size();
} catch (Exception e) {
throw new RedactionException(e);
}
log.info("Document structure analysis successful, starting redaction analysis...");
imageClassificationService.classifyImages(classifiedDoc);
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
@ -8,11 +9,12 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
@ -20,13 +22,18 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingC
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -37,14 +44,15 @@ import java.util.Map;
@RequiredArgsConstructor
public class PdfSegmentationService {
private final static int MAX_PAGES_BEFORE_GC = 200;
private final RulingCleaningService rulingCleaningService;
private final TableExtractionService tableExtractionService;
private final BlockificationService blockificationService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final RedactionStorageService redactionStorageService;
private final ImageClassificationService imageClassificationService;
private final ObjectMapper objectMapper;
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
@ -120,13 +128,27 @@ public class PdfSegmentationService {
}
public Document parseDocument(PDDocument pdDocument) throws IOException {
public Document parseDocument(InputStream documentInputStream) throws IOException {
//create tempFile
File tempFile = File.createTempFile("document", ".pdf");
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
// initialize required variables
Document document = new Document();
List<Page> pages = new ArrayList<>();
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
PDDocument pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
}
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
@ -157,6 +179,9 @@ public class PdfSegmentationService {
increaseDocumentStatistics(page, document);
page.setImages(stripper.getImages());
imageClassificationService.classifyImages(page);
pages.add(page);
}
@ -166,12 +191,31 @@ public class PdfSegmentationService {
sectionsBuilderService.buildSections(document);
sectionsBuilderService.addImagesToSections(document);
pdDocument = reinitializePDDocument(tempFile, pdDocument);
// This can be improved an done in one pass, but it's complicated to do right away
postProcessSections(pdDocument, document.getSectionText());
tempFile.delete();
return document;
}
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
if (pdDocument != null) {
pdDocument.close();
}
System.runFinalization();
System.gc();
MemoryStats.printMemoryStats();
var newPDDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupTempFileOnly());
newPDDocument.setResourceCache(null);
return newPDDocument;
}
private void increaseDocumentStatistics(Page page, Document document) {
@ -203,4 +247,5 @@ public class PdfSegmentationService {
}
}

View File

@ -10,6 +10,7 @@ import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
@ -17,6 +18,7 @@ import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
@ -440,6 +442,16 @@ public class RedactionIntegrationTest {
}
@Test
@Ignore
public void testLargeScannedFileOOM(){
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
MemoryStats.printMemoryStats();
AnalyzeResult result = redactionController.analyze(request);
assertThat(result).isNotNull();
}
@Test
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
@ -509,7 +521,6 @@ public class RedactionIntegrationTest {
@Test
public void redactionTest() throws IOException {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());

View File

@ -11,7 +11,6 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUti
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.storage.commons.service.StorageService;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
@ -140,12 +139,10 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
@ -168,12 +165,10 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1)).hasSize(7);// 3 author cells, 1 address, 1 Y and 2 N entities
}
@ -195,24 +190,20 @@ public class EntityRedactionServiceTest {
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
pdfFileResource = new ClassPathResource("files/Compounds/27 A8637C - EU AIR3 - MCP Section 1 - Identity of " +
"the plant protection product.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()
.entrySet()
.stream()
.noneMatch(entry -> entry.getValue().stream().anyMatch(e -> e.getMatchedRule() == 9))).isTrue();
}
@Test
@ -232,14 +223,12 @@ public class EntityRedactionServiceTest {
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
.count()).isEqualTo(10);
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 9)
.count()).isEqualTo(10);
}
@ -301,14 +290,12 @@ public class EntityRedactionServiceTest {
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
.count()).isEqualTo(13);
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 6)
.count()).isEqualTo(13);
}
@ -342,14 +329,12 @@ public class EntityRedactionServiceTest {
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 11)
.count()).isEqualTo(1);
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream()
.filter(entity -> entity.getMatchedRule() == 11)
.count()).isEqualTo(1);
}
@ -374,13 +359,11 @@ public class EntityRedactionServiceTest {
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(2); // two pages
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(8);
assertThat(classifiedDoc.getEntities().get(2).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(5); // 2 names, 1 address, 2 Y
pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");
@ -395,13 +378,11 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
}
classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 9).count()).isEqualTo(3);
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(9);
}
@ -426,12 +407,10 @@ public class EntityRedactionServiceTest {
.entries(Collections.emptyList())
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument);
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
}
Document classifiedDoc = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
entityRedactionService.processDocument(classifiedDoc, TEST_RULESET_ID, null);
assertThat(classifiedDoc.getEntities()).hasSize(1); // one page
assertThat(classifiedDoc.getEntities().get(1).stream().filter(entity -> entity.getMatchedRule() == 8).count()).isEqualTo(6);
}
@ -510,7 +489,7 @@ public class EntityRedactionServiceTest {
}
}
private List<DictionaryEntry> toDictionaryEntry(List<String> entries){
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
List<DictionaryEntry> dictionaryEntries = new ArrayList<>();
entries.forEach(entry -> {
dictionaryEntries.add(new DictionaryEntry(entry, 1L, false));

View File

@ -9,7 +9,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
@ -58,19 +57,17 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
int i = 0;
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
fileOutputStream.write(baos.toByteArray());
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
int i = 0;
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
fileOutputStream.write(baos.toByteArray());
}
i++;
}
i++;
}
}
}
@ -81,21 +78,19 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table table = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table table = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
@ -104,38 +99,36 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
@ -144,38 +137,36 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
@ -184,38 +175,36 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
Document document = pdfSegmentationService.parseDocument(pdDocument);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())).isNotEmpty();
Table firstTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
Table secondTable = document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
.collect(Collectors.toList())
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList())
.equals(firstTableHeaderCells))).isTrue();
}
}