Pull request #157: RED-1390: Do not classify images that are nearly as big as the page

Merge in RED/redaction-service from RED-1390 to master

* commit '13974d13734cc9c43e44338bef885a1ace2e5f50':
  RED-1390: Do not classify images that are nearly as big as the page
This commit is contained in:
Dominique Eiflaender 2021-05-19 12:03:43 +02:00
commit 8ece616229
4 changed files with 20 additions and 2 deletions

View File

@ -30,6 +30,8 @@ public class Page {
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private double cropBoxArea;
public boolean isRotated() {

View File

@ -5,6 +5,7 @@ import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClien
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -26,7 +27,7 @@ public class ImageClassificationService {
page.getImages().forEach(image -> {
if (settings.isEnableImageClassification()) {
if (settings.isEnableImageClassification() && !isEntirePageImage(image, page)) {
long start = System.currentTimeMillis();
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
@ -38,7 +39,6 @@ public class ImageClassificationService {
log.error("Could not classify image", e);
image.setImageType(ImageType.OTHER);
}
log.info("Image classification took: " + (System.currentTimeMillis() - start));
} else {
image.setImageType(ImageType.OTHER);
@ -59,4 +59,13 @@ public class ImageClassificationService {
}
/**
 * Decides whether an image covers (almost) the whole page, in which case the caller does not
 * send it to image classification.
 *
 * <p>The image area (height times width of the image position) is divided by the page's crop
 * box area. The image counts as an entire-page image when that ratio is greater than or equal
 * to {@code settings.getMaxImageCropboxRatio()}.
 *
 * @param image the image whose position rectangle is measured
 * @param page  the page that provides the crop box area to compare against
 * @return {@code true} if the image-to-crop-box area ratio reaches the configured maximum;
 *         {@code false} otherwise, including when the crop box area is not a positive number
 */
private boolean isEntirePageImage(PdfImage image, Page page) {

    double cropBoxArea = page.getCropBoxArea();
    // Without a positive crop box area the ratio is meaningless: dividing by 0 yields
    // Infinity, which would flag every image as "entire page". Written as a negated
    // comparison so that NaN is rejected as well.
    if (!(cropBoxArea > 0)) {
        return false;
    }

    double imageArea = image.getPosition().getHeight() * image.getPosition().getWidth();
    if (imageArea / cropBoxArea >= settings.getMaxImageCropboxRatio()) {
        log.info("Skipping image classification because images is almost as large as the entire page");
        return true;
    }
    return false;
}
}

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -93,6 +94,10 @@ public class PdfSegmentationService {
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
.getVertical());
PDRectangle cropbox = pdPage.getCropBox();
float cropboxArea = cropbox.getHeight() * cropbox.getWidth();
page.setCropBoxArea(cropboxArea);
page.setRotation(rotation);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);

View File

@ -13,4 +13,6 @@ public class RedactionServiceSettings {
// Switch for image classification; enabled by default.
private boolean enableImageClassification = true;
// Threshold for (image area / page crop box area): an image whose ratio is at or above this
// value is treated as covering the entire page and is not classified. Defaults to 0.9.
private float maxImageCropboxRatio = 0.9f;
}