Pull request #157: RED-1390: Do not classify images that are nearly as big as the page
Merge in RED/redaction-service from RED-1390 to master * commit '13974d13734cc9c43e44338bef885a1ace2e5f50': RED-1390: Do not classify images that are nearly as big as the page
This commit is contained in:
commit
8ece616229
@ -30,6 +30,8 @@ public class Page {
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
|
||||
private double cropBoxArea;
|
||||
|
||||
|
||||
public boolean isRotated() {
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClien
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
|
||||
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -26,7 +27,7 @@ public class ImageClassificationService {
|
||||
|
||||
page.getImages().forEach(image -> {
|
||||
|
||||
if (settings.isEnableImageClassification()) {
|
||||
if (settings.isEnableImageClassification() && !isEntirePageImage(image, page)) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
@ -38,7 +39,6 @@ public class ImageClassificationService {
|
||||
log.error("Could not classify image", e);
|
||||
image.setImageType(ImageType.OTHER);
|
||||
}
|
||||
|
||||
log.info("Image classification took: " + (System.currentTimeMillis() - start));
|
||||
} else {
|
||||
image.setImageType(ImageType.OTHER);
|
||||
@ -59,4 +59,13 @@ public class ImageClassificationService {
|
||||
|
||||
}
|
||||
|
||||
private boolean isEntirePageImage(PdfImage image, Page page){
|
||||
double imageArea = image.getPosition().getHeight() * image.getPosition().getWidth();
|
||||
if(imageArea / page.getCropBoxArea() >= settings.getMaxImageCropboxRatio()){
|
||||
log.info("Skipping image classification because images is almost as large as the entire page");
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
@ -93,6 +94,10 @@ public class PdfSegmentationService {
|
||||
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings
|
||||
.getVertical());
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
float cropboxArea = cropbox.getHeight() * cropbox.getWidth();
|
||||
page.setCropBoxArea(cropboxArea);
|
||||
|
||||
page.setRotation(rotation);
|
||||
page.setLandscape(isLandscape || isRotated);
|
||||
page.setPageNumber(pageNumber);
|
||||
|
||||
@ -13,4 +13,6 @@ public class RedactionServiceSettings {
|
||||
|
||||
private boolean enableImageClassification = true;
|
||||
|
||||
private float maxImageCropboxRatio = 0.9f;
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user