added response formatter and pipeline test

2022-03-31 19:01:32 +02:00 · 2022-03-31 19:01:32 +02:00 · 5caa9807e2
commit 5caa9807e2
parent 82added50a
12 changed files with 156 additions and 14 deletions
--- a/image_prediction/response.py
+++ b/image_prediction/response.py
--- a/image_prediction/default_objects.py
+++ b/image_prediction/default_objects.py
@ -11,10 +11,10 @@ def get_mlflow_model_loader(mlruns_dir):
    return model_loader
-def load_pipeline():
+def load_pipeline(**kwargs):
    model_loader = get_mlflow_model_loader(MLRUNS_DIR)
    model_identifier = CONFIG.service.run_id
-    pipeline = Pipeline(model_loader, model_identifier)
+    pipeline = Pipeline(model_loader, model_identifier, **kwargs)
    return pipeline
--- a/image_prediction/extractor_classifier/extractor_classifier.py
+++ b/image_prediction/extractor_classifier/extractor_classifier.py
@ -18,7 +18,7 @@ class ExtractorClassifier:
        images, metadata = zip(*batch)
        predictions = self.classifier(images)
-        responses = ({"prediction": prd, **mdt} for prd, mdt in zip(predictions, metadata))
+        responses = ({"classification": prd, **mdt} for prd, mdt in zip(predictions, metadata))
        return responses
    def __call__(self, obj) -> Iterable[ImageMetadataPair]:
--- a/image_prediction/formatter/formatters/response.py
+++ b/image_prediction/formatter/formatters/response.py
@ -0,0 +1,72 @@
 import math
 from operator import itemgetter
 from image_prediction.config import CONFIG
 from image_prediction.transformer.transformer import Transformer
 class ResponseTransformer(Transformer):
    """Transformer that converts prediction records into response entries via build_image_info."""

    def transform(self, data):
        """Build the response structure for a single record or for every record of an iterable.

        A mapping is treated as one record and converted immediately. Anything else is
        treated as an iterable of records and converted lazily with ``map``.

        The dispatch is done on the type instead of catching ``TypeError``: a ``TypeError``
        raised while converting a malformed record must propagate as-is rather than being
        misread as "data is an iterable", which would map over the record's keys and
        fail later with an unrelated error.
        """
        # Local import keeps this block self-contained without touching the file's import header.
        from collections.abc import Mapping

        if isinstance(data, Mapping):
            return build_image_info(data)
        return map(build_image_info, data)
 def build_image_info(data: dict) -> dict:
    """Assemble the response entry for one classified image.

    Reads the page size, bounding box, image size, page index and classification from
    ``data`` and evaluates them against the thresholds in ``CONFIG.filters``.

    Args:
        data: Record holding the keys "page_width", "page_height", "x1", "x2", "y1",
            "y2", "width", "height", "page_idx" and "classification"; the
            classification must contain a "probabilities" mapping.

    Returns:
        A dict with the keys "classification", "position", "geometry" and "filters".

    Raises:
        KeyError: If one of the required keys is missing from ``data``.
        ZeroDivisionError: If the page area or ``height`` is zero.
    """
    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
    )(data)

    # Image-to-page size: ratio of the square roots of the bounding box area and the page area.
    page_side = math.sqrt(abs(page_width * page_height))
    image_side = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
    size_quotient = round(image_side / page_side, 4)

    size_limits = CONFIG.filters.image_to_page_quotient
    too_small = bool(size_quotient < size_limits.min)
    too_large = bool(size_quotient > size_limits.max)

    # Image format: the unrounded width/height ratio is compared, only the reported value is rounded.
    aspect_ratio = width / height
    format_limits = CONFIG.filters.image_width_to_height_quotient
    too_tall = bool(aspect_ratio < format_limits.min)
    too_wide = bool(aspect_ratio > format_limits.max)

    classification = data["classification"]
    top_probability = max(classification["probabilities"].values())
    unconfident = bool(top_probability < CONFIG.filters.min_confidence)

    breaches = (too_large, too_small, too_tall, too_wide, unconfident)

    position = {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1}
    geometry_filters = {
        "imageSize": {
            "quotient": size_quotient,
            "tooLarge": too_large,
            "tooSmall": too_small,
        },
        "imageFormat": {
            "quotient": round(aspect_ratio, 4),
            "tooTall": too_tall,
            "tooWide": too_wide,
        },
    }

    return {
        "classification": classification,
        "position": position,
        "geometry": {"width": width, "height": height},
        "filters": {
            "geometry": geometry_filters,
            "probability": {"unconfident": unconfident},
            "allPassed": not any(breaches),
        },
    }
--- a/image_prediction/locations.py
+++ b/image_prediction/locations.py
@ -7,3 +7,5 @@ CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
 DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
 MLRUNS_DIR = path.join(DATA_DIR, "mlruns")
 TEST_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "test", "data")
--- a/image_prediction/pipeline.py
+++ b/image_prediction/pipeline.py
@ -4,9 +4,12 @@ from funcy import rcompose, juxt
 from image_prediction.classifier.classifier import Classifier
 from image_prediction.classifier.image_classifier import ImageClassifier
 from image_prediction.compositor.compositor import TransformerCompositor
 from image_prediction.estimator.adapter.adapter import EstimatorAdapter
 from image_prediction.extractor_classifier.extractor_classifier import ExtractorClassifier
 from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.formatter.formatters.response import ResponseTransformer
 from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
 from image_prediction.label_mapper.mappers.probability import ProbabilityMapper
@ -18,27 +21,29 @@ def get_image_classifier(model_loader, model_identifier):
    return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))
-def get_extractor():
+def get_extractor(**kwargs):
-    image_extractor = ParsablePDFImageExtractor(verbose=True)
+    image_extractor = ParsablePDFImageExtractor(**kwargs)
    return image_extractor
-def get_extractor_classifier(model_loader, model_identifier):
+def get_extractor_classifier(model_loader, model_identifier, **kwargs):
-    extractor_classifier = ExtractorClassifier(get_extractor(), get_image_classifier(model_loader, model_identifier))
+    extractor_classifier = ExtractorClassifier(
        get_extractor(**kwargs), get_image_classifier(model_loader, model_identifier)
    )
    return extractor_classifier
 def get_formatter():
-    formatter = EnumFormatter()
+    formatter = TransformerCompositor(EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter())
    return formatter
 class Pipeline:
-    def __init__(self, model_loader, model_identifier):
+    def __init__(self, model_loader, model_identifier, **kwargs):
-        self.pipe = rcompose(get_extractor_classifier(model_loader, model_identifier), get_formatter())
+        self.pipe = rcompose(get_extractor_classifier(model_loader, model_identifier, **kwargs), get_formatter())
    def __call__(self, pdf: bytes):
        yield from self.pipe(pdf)
--- a/scripts/run_pipeline.py
+++ b/scripts/run_pipeline.py
@ -19,8 +19,11 @@ def main(args):
    with open(args.pdf, "rb") as f:
        predictions = pipeline(f.read())
    with open("/tmp/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json", "w") as f:
        json.dump(list(predictions), f, indent=2)
    for prd in predictions:
-        print(json.dumps(prd, indent=1))
+        print(json.dumps(prd, indent=2))
 if __name__ == "__main__":
--- a/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf
+++ b/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf
--- a/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json
+++ b/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json
@ -0,0 +1,42 @@
 [
  {
    "classification": {
      "label": "formula",
      "probabilities": {
        "formula": 1.0,
        "logo": 0.0,
        "other": 0.0,
        "signature": 0.0
      }
    },
    "position": {
      "x1": 321,
      "x2": 515,
      "y1": 300,
      "y2": 494,
      "pageNumber": 2
    },
    "geometry": {
      "width": 389,
      "height": 389
    },
    "filters": {
      "geometry": {
        "imageSize": {
          "quotient": 0.2741,
          "tooLarge": false,
          "tooSmall": false
        },
        "imageFormat": {
          "quotient": 1.0,
          "tooTall": false,
          "tooWide": false
        }
      },
      "probability": {
        "unconfident": false
      },
      "allPassed": true
    }
  }
 ]
--- a/test/unit_tests/conftest.py
+++ b/test/unit_tests/conftest.py
@ -227,13 +227,13 @@ def map_labels(numeric_labels, classes):
@pytest.fixture
 def metadata_plus_mapped_prediction(expected_predictions_mapped, metadata):
-    return [{"prediction": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped, metadata)]
+    return [{"classification": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped, metadata)]
@pytest.fixture
 def metadata_formatted_plus_mapped_prediction_formatted(expected_predictions_mapped_and_formatted, metadata_formatted):
    return [
-        {"prediction": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped_and_formatted, metadata_formatted)
+        {"classification": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped_and_formatted, metadata_formatted)
    ]
--- a/test/unit_tests/extractor_classifier_test.py
+++ b/test/unit_tests/extractor_classifier_test.py
@ -10,5 +10,5 @@ from image_prediction.extractor_classifier.extractor_classifier import Extractor
 def test_extractor_classifier(image_extractor, image_classifier, images, batch_of_expected_string_labels):
    extractor_classifier = ExtractorClassifier(image_extractor, image_classifier)
    results = extractor_classifier(images)
-    labels = list(map(itemgetter("prediction"), results))
+    labels = list(map(itemgetter("classification"), results))
    assert labels == batch_of_expected_string_labels
--- a/test/unit_tests/pipeline_test.py
+++ b/test/unit_tests/pipeline_test.py
@ -0,0 +1,18 @@
 import json
 import os
 from image_prediction.default_objects import load_pipeline
 from image_prediction.locations import TEST_DATA_DIR
 def test_pipeline():
    """Run the default pipeline on the sample PDF and compare against the stored predictions."""
    pipeline = load_pipeline(verbose=False)

    pdf_path = os.path.join(TEST_DATA_DIR, "f2dc689ca794fccb8cd38b95f2bf6ba9.pdf")
    expected_path = os.path.join(TEST_DATA_DIR, "f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json")

    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    # The pipeline yields lazily; materialize it so it can be compared with the JSON list.
    predictions = list(pipeline(pdf_bytes))

    with open(expected_path, "r") as expected_file:
        expectations = json.load(expected_file)

    assert predictions == expectations