From a34d2fb6753579a9693322f9a29098c5cb0e3876 Mon Sep 17 00:00:00 2001 From: Timo Date: Sun, 18 Apr 2021 11:30:53 +0300 Subject: [PATCH 1/2] proper error handling for image classification --- .../service/ImageClassificationService.java | 2 +- .../segmentation/PdfSegmentationService.java | 126 +++++++++--------- .../v1/server/RedactionIntegrationTest.java | 2 - 3 files changed, 67 insertions(+), 63 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java index 44012156..ddd945ab 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java @@ -35,7 +35,7 @@ public class ImageClassificationService { var mockFile = new MockMultipartFile("file", "Image.png", "image/png", baos.toByteArray()); ImageClassificationResponse response = imageClassificationClient.classify(mockFile); image.setImageType(ImageType.valueOf(response.getCategory())); - } catch (IOException e) { + } catch (Exception e) { log.error("Could not classify image", e); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 1e88a7c4..2eb06c3d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -127,80 +127,86 @@ public class PdfSegmentationService { public Document parseDocument(InputStream documentInputStream) throws IOException { + PDDocument pdDocument = null; + try { + //create tempFile + File tempFile = File.createTempFile("document", ".pdf"); + IOUtils.copy(documentInputStream, new FileOutputStream(tempFile)); - //create tempFile - File tempFile = File.createTempFile("document", ".pdf"); - IOUtils.copy(documentInputStream, new FileOutputStream(tempFile)); - - // initialize required variables - Document document = new Document(); - List pages = new ArrayList<>(); + // initialize required variables + Document document = new Document(); + List pages = new ArrayList<>(); - PDDocument pdDocument = reinitializePDDocument(tempFile, null); - long pageCount = pdDocument.getNumberOfPages(); + pdDocument = reinitializePDDocument(tempFile, null); + long pageCount = pdDocument.getNumberOfPages(); + + for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { + + if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { + pdDocument = reinitializePDDocument(tempFile, pdDocument); + } + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); + + PDRectangle pdr = pdPage.getMediaBox(); + boolean isLandscape = pdr.getWidth() > pdr.getHeight(); + + int rotation = pdPage.getRotation(); + boolean isRotated = rotation != 0 && rotation != 360; + + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper + .getMaxCharHeight()); + + Page page = blockificationService.blockify(stripper.getTextPositionSequences(), 
cleanRulings.getHorizontal(), cleanRulings + .getVertical()); + page.setRotation(rotation); + + tableExtractionService.extractTables(cleanRulings, page); + + buildPageStatistics(page); + + page.setLandscape(isLandscape || isRotated); + + page.setPageNumber(pageNumber); + increaseDocumentStatistics(page, document); + + page.setImages(stripper.getImages()); + + imageClassificationService.classifyImages(page); + + pages.add(page); - for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { - if (pageNumber % MAX_PAGES_BEFORE_GC == 0) { - pdDocument = reinitializePDDocument(tempFile, pdDocument); } - PDFLinesTextStripper stripper = new PDFLinesTextStripper(); - PDPage pdPage = pdDocument.getPage(pageNumber - 1); - stripper.setPageNumber(pageNumber); - stripper.setStartPage(pageNumber); - stripper.setEndPage(pageNumber); - stripper.setPdpage(pdPage); - stripper.getText(pdDocument); + document.setPages(pages); - PDRectangle pdr = pdPage.getMediaBox(); - boolean isLandscape = pdr.getWidth() > pdr.getHeight(); + classificationService.classifyDocument(document); + sectionsBuilderService.buildSections(document); + sectionsBuilderService.addImagesToSections(document); - int rotation = pdPage.getRotation(); - boolean isRotated = rotation != 0 && rotation != 360; + pdDocument = reinitializePDDocument(tempFile, pdDocument); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(stripper.getRulings(), stripper.getMinCharWidth(), stripper - .getMaxCharHeight()); + // This can be improved an done in one pass, but it's complicated to do right away + postProcessSections(pdDocument, document.getSectionText()); - Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings - .getVertical()); - page.setRotation(rotation); - - tableExtractionService.extractTables(cleanRulings, page); - - buildPageStatistics(page); - - page.setLandscape(isLandscape || isRotated); - - page.setPageNumber(pageNumber); - 
increaseDocumentStatistics(page, document); - - page.setImages(stripper.getImages()); - - imageClassificationService.classifyImages(page); - - pages.add(page); + IOUtils.close(pdDocument); + tempFile.delete(); + return document; + } finally { + if (pdDocument != null) { + pdDocument.close(); + } } - - document.setPages(pages); - - classificationService.classifyDocument(document); - sectionsBuilderService.buildSections(document); - sectionsBuilderService.addImagesToSections(document); - - pdDocument = reinitializePDDocument(tempFile, pdDocument); - - // This can be improved an done in one pass, but it's complicated to do right away - postProcessSections(pdDocument, document.getSectionText()); - - IOUtils.close(pdDocument); - - tempFile.delete(); - - return document; } private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 6d84c905..2a998b14 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -19,7 +19,6 @@ import lombok.SneakyThrows; import org.apache.commons.io.IOUtils; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.KieServices; @@ -452,7 +451,6 @@ public class RedactionIntegrationTest { @Test - @Ignore public void testLargeScannedFileOOM() { AnalyzeRequest request = prepareStorage("scanned/VV-377031.pdf"); MemoryStats.printMemoryStats(); From 42fcea85d30f914fe02e304e69a551da409c6c3b Mon Sep 17 
00:00:00 2001 From: Timo Date: Sun, 18 Apr 2021 11:31:33 +0300 Subject: [PATCH 2/2] set image type on error --- .../v1/server/redaction/service/ImageClassificationService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java index ddd945ab..a845af9c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java @@ -12,7 +12,6 @@ import org.springframework.stereotype.Service; import javax.imageio.ImageIO; import java.io.ByteArrayOutputStream; -import java.io.IOException; @Slf4j @Service @@ -37,6 +36,7 @@ public class ImageClassificationService { image.setImageType(ImageType.valueOf(response.getCategory())); } catch (Exception e) { log.error("Could not classify image", e); + image.setImageType(ImageType.OTHER); } log.info("Image classification took: " + (System.currentTimeMillis() - start));