diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java
index 873ae8a1..42bfa82e 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java
@@ -30,6 +30,9 @@ public class Page {
     private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
     private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
 
+    /** Area of the page crop box (height * width); image areas are compared against it before classification. */
+    private double cropBoxArea;
+
 
     public boolean isRotated() {
 
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java
index a845af9c..6a590353 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java
@@ -5,6 +5,7 @@ import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClien
 import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
 import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
 import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
+import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
 import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
@@ -26,7 +27,7 @@ public class ImageClassificationService {
 
         page.getImages().forEach(image -> {
 
-            if (settings.isEnableImageClassification()) {
+            if (settings.isEnableImageClassification() && !isEntirePageImage(image, page)) {
 
                 long start = System.currentTimeMillis();
                 try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
@@ -38,7 +39,6 @@ public class ImageClassificationService {
                     log.error("Could not classify image", e);
                     image.setImageType(ImageType.OTHER);
                 }
-                log.info("Image classification took: " + (System.currentTimeMillis() - start));
 
             } else {
                 image.setImageType(ImageType.OTHER);
@@ -59,4 +59,29 @@ public class ImageClassificationService {
     }
 
 
+    /**
+     * Tells whether an image covers so much of its page that image
+     * classification should be skipped for it.
+     *
+     * @param image image whose position supplies the height and width that form its area
+     * @param page  page whose crop box area the image area is compared to
+     * @return {@code true} when image area / crop box area is at least
+     *         {@code settings.getMaxImageCropboxRatio()}; {@code false} otherwise,
+     *         including when the crop box area is not positive
+     */
+    private boolean isEntirePageImage(PdfImage image, Page page) {
+        double cropBoxArea = page.getCropBoxArea();
+        if (cropBoxArea <= 0) {
+            // An unset (0.0) or degenerate crop box area would make the division below
+            // yield Infinity/NaN and wrongly mark every image as page-sized.
+            return false;
+        }
+        double imageArea = image.getPosition().getHeight() * image.getPosition().getWidth();
+        if (imageArea / cropBoxArea >= settings.getMaxImageCropboxRatio()) {
+            log.info("Skipping image classification because images is almost as large as the entire page");
+            return true;
+        }
+        return false;
+    }
+
 }
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
index 270f95e1..43f5192f 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
@@ -1,5 +1,6 @@
 package com.iqser.red.service.redaction.v1.server.segmentation;
 
+import com.iqser.red.service.redaction.v1.model.Rectangle;
 import com.iqser.red.service.redaction.v1.server.classification.model.Document;
 import com.iqser.red.service.redaction.v1.server.classification.model.Page;
 import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@@ -93,6 +94,10 @@ public class PdfSegmentationService {
 
                 Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
                         .getVertical());
+                PDRectangle cropbox = pdPage.getCropBox();
+                float cropboxArea = cropbox.getHeight() * cropbox.getWidth();
+                page.setCropBoxArea(cropboxArea);
+
                 page.setRotation(rotation);
                 page.setLandscape(isLandscape || isRotated);
                 page.setPageNumber(pageNumber);
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java
index f7c9f894..3d5b0b5b 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java
@@ -13,4 +13,7 @@ public class RedactionServiceSettings {
 
     private boolean enableImageClassification = true;
 
+    /** Images whose area is at least this fraction of the page crop box area are not sent to image classification. */
+    private float maxImageCropboxRatio = 0.9f;
+
 }