Pull request #140: Improved redaction performance
Merge in RED/redaction-service from improved-redaction-performance to master * commit '42fcea85d30f914fe02e304e69a551da409c6c3b': set image type on error; proper error handling for image classification
This commit is contained in:
commit
674ebf8eb7
@@ -12,7 +12,6 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@@ -35,8 +34,9 @@ public class ImageClassificationService {
|
||||
var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray());
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(mockFile);
|
||||
image.setImageType(ImageType.valueOf(response.getCategory()));
|
||||
} catch (IOException e) {
|
||||
} catch (Exception e) {
|
||||
log.error("Could not classify image", e);
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
|
||||
@@ -127,80 +127,86 @@ public class PdfSegmentationService {
|
||||
|
||||
|
||||
public Document parseDocument(InputStream documentInputStream) throws IOException {
|
||||
PDDocument pdDocument = null;
|
||||
try {
|
||||
//create tempFile
|
||||
File tempFile = File.createTempFile("document", ".pdf");
|
||||
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
|
||||
|
||||
//create tempFile
|
||||
File tempFile = File.createTempFile("document", ".pdf");
|
||||
IOUtils.copy(documentInputStream, new FileOutputStream(tempFile));
|
||||
|
||||
// initialize required variables
|
||||
Document document = new Document();
|
||||
List<Page> pages = new ArrayList<>();
|
||||
// initialize required variables
|
||||
Document document = new Document();
|
||||
List<Page> pages = new ArrayList<>();
|
||||
|
||||
|
||||
PDDocument pdDocument = reinitializePDDocument(tempFile, null);
|
||||
long pageCount = pdDocument.getNumberOfPages();
|
||||
pdDocument = reinitializePDDocument(tempFile, null);
|
||||
long pageCount = pdDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
}
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
|
||||
.getMaxCharHeight());
|
||||
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
page.setRotation(rotation);
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
|
||||
buildPageStatistics(page);
|
||||
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
page.setImages(stripper.getImages());
|
||||
|
||||
imageClassificationService.classifyImages(page);
|
||||
|
||||
pages.add(page);
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % MAX_PAGES_BEFORE_GC == 0) {
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
}
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
document.setPages(pages);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
|
||||
classificationService.classifyDocument(document);
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isRotated = rotation != 0 && rotation != 360;
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper
|
||||
.getMaxCharHeight());
|
||||
// This can be improved and done in one pass, but it's complicated to do right away
|
||||
postProcessSections(pdDocument, document.getSectionText());
|
||||
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
page.setRotation(rotation);
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, page);
|
||||
|
||||
buildPageStatistics(page);
|
||||
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
page.setImages(stripper.getImages());
|
||||
|
||||
imageClassificationService.classifyImages(page);
|
||||
|
||||
pages.add(page);
|
||||
IOUtils.close(pdDocument);
|
||||
|
||||
tempFile.delete();
|
||||
|
||||
return document;
|
||||
} finally {
|
||||
if (pdDocument != null) {
|
||||
pdDocument.close();
|
||||
}
|
||||
}
|
||||
|
||||
document.setPages(pages);
|
||||
|
||||
classificationService.classifyDocument(document);
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
|
||||
// This can be improved and done in one pass, but it's complicated to do right away
|
||||
postProcessSections(pdDocument, document.getSectionText());
|
||||
|
||||
IOUtils.close(pdDocument);
|
||||
|
||||
tempFile.delete();
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
|
||||
|
||||
@@ -19,7 +19,6 @@ import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.KieServices;
|
||||
@@ -452,7 +451,6 @@ public class RedactionIntegrationTest {
|
||||
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void testLargeScannedFileOOM() {
|
||||
AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf");
|
||||
MemoryStats.printMemoryStats();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user