Pull request #140: Improved redaction performance

Merge in RED/redaction-service from improved-redaction-performance to master

* commit '42fcea85d30f914fe02e304e69a551da409c6c3b':
  set image type on error
  proper error handling for image classification
This commit is contained in:
Timo Bejan 2021-04-18 11:39:27 +02:00
commit 674ebf8eb7
3 changed files with 68 additions and 64 deletions

View File

@ -12,7 +12,6 @@ import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@Slf4j
@Service
@ -35,8 +34,9 @@ public class ImageClassificationService {
var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
image.setImageType(ImageType.valueOf(response.getCategory()));
} catch (IOException e) {
} catch (Exception e) {
log.error("Could not classify image", e);
image.setImageType(ImageType.OTHER);
}
log.info("Image classification took: " + (System.currentTimeMillis() - start));

View File

@ -127,80 +127,86 @@ public class PdfSegmentationService {
public Document parseDocument(InputStream documentInputStream) throws IOException {
PDDocument pdDocument = null;
try {
//create tempFile
File tempFile = File.createTempFile("document", ".pdf");
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
//create tempFile
File tempFile = File.createTempFile("document", ".pdf");
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
// initialize required variables
Document document = new Document();
List<Page> pages = new ArrayList<>();
// initialize required variables
Document document = new Document();
List<Page> pages = new ArrayList<>();
PDDocument pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages();
pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
}
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument);
PDRectangle pdr = pdPage.getMediaBox();
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360;
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
.getMaxCharHeight());
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
page.setRotation(rotation);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
increaseDocumentStatistics(page, document);
page.setImages(stripper.getImages());
imageClassificationService.classifyImages(page);
pages.add(page);
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
}
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument);
document.setPages(pages);
PDRectangle pdr = pdPage.getMediaBox();
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
classificationService.classifyDocument(document);
sectionsBuilderService.buildSections(document);
sectionsBuilderService.addImagesToSections(document);
int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360;
pdDocument = reinitializePDDocument(tempFile, pdDocument);
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
.getMaxCharHeight());
// This can be improved and done in one pass, but it's complicated to do right away
postProcessSections(pdDocument, document.getSectionText());
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
page.setRotation(rotation);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
increaseDocumentStatistics(page, document);
page.setImages(stripper.getImages());
imageClassificationService.classifyImages(page);
pages.add(page);
IOUtils.close(pdDocument);
tempFile.delete();
return document;
} finally {
if (pdDocument != null) {
pdDocument.close();
}
}
document.setPages(pages);
classificationService.classifyDocument(document);
sectionsBuilderService.buildSections(document);
sectionsBuilderService.addImagesToSections(document);
pdDocument = reinitializePDDocument(tempFile, pdDocument);
// This can be improved and done in one pass, but it's complicated to do right away
postProcessSections(pdDocument, document.getSectionText());
IOUtils.close(pdDocument);
tempFile.delete();
return document;
}
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {

View File

@ -19,7 +19,6 @@ import lombok.SneakyThrows;
import org.apache.commons.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
@ -452,7 +451,6 @@ public class RedactionIntegrationTest {
@Test
@Ignore
public void testLargeScannedFileOOM() {
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
MemoryStats.printMemoryStats();