added response formatter and pipeline test

2022-03-31 19:01:32 +02:00 · 2022-03-31 19:01:32 +02:00 · 5caa9807e2
commit 5caa9807e2
parent 82added50a
12 changed files with 156 additions and 14 deletions
--- a/image_prediction/response.py
+++ b/image_prediction/response.py
--- a/image_prediction/default_objects.py
+++ b/image_prediction/default_objects.py
@ -11,10 +11,10 @@ def get_mlflow_model_loader(mlruns_dir):
    return model_loader


-def load_pipeline():
+def load_pipeline(**kwargs):
    model_loader = get_mlflow_model_loader(MLRUNS_DIR)
    model_identifier = CONFIG.service.run_id

-    pipeline = Pipeline(model_loader, model_identifier)
+    pipeline = Pipeline(model_loader, model_identifier, **kwargs)

    return pipeline
--- a/image_prediction/extractor_classifier/extractor_classifier.py
+++ b/image_prediction/extractor_classifier/extractor_classifier.py
@ -18,7 +18,7 @@ class ExtractorClassifier:
        images, metadata = zip(*batch)

        predictions = self.classifier(images)
-        responses = ({"prediction": prd, **mdt} for prd, mdt in zip(predictions, metadata))
+        responses = ({"classification": prd, **mdt} for prd, mdt in zip(predictions, metadata))
        return responses

    def __call__(self, obj) -> Iterable[ImageMetadataPair]:
--- a/image_prediction/formatter/formatters/response.py
+++ b/image_prediction/formatter/formatters/response.py
@ -0,0 +1,72 @@
+import math
+from collections.abc import Mapping
+from operator import itemgetter
+
+from image_prediction.config import CONFIG
+from image_prediction.transformer.transformer import Transformer
+
+
+class ResponseTransformer(Transformer):
+    """Transformer that converts classification records into image-info response dicts."""
+
+    def transform(self, data):
+        """Convert one record, or every record of an iterable, with ``build_image_info``.
+
+        Args:
+            data: Either a single mapping (one record) or an iterable of such
+                mappings.
+
+        Returns:
+            The image-info dict for a single record, or a lazy ``map`` object
+            yielding one image-info dict per record of the iterable.
+        """
+        # Dispatch on the input type explicitly instead of catching TypeError:
+        # a TypeError raised while processing a single record (e.g. a
+        # non-numeric width) must surface as-is rather than causing the record
+        # to be re-interpreted as an iterable of its keys.
+        if isinstance(data, Mapping):
+            return build_image_info(data)
+        return map(build_image_info, data)
+
+
+def build_image_info(data: dict) -> dict:
+    """Assemble the response entry for one classified image.
+
+    Reads the page size, bounding box, image size, page index and
+    classification from ``data`` and evaluates the size, format and
+    confidence filters against the thresholds found in ``CONFIG.filters``.
+
+    Args:
+        data: Record providing the keys ``page_width``, ``page_height``,
+            ``x1``, ``x2``, ``y1``, ``y2``, ``width``, ``height``,
+            ``page_idx`` and ``classification``; the classification must
+            contain a ``probabilities`` mapping.
+
+    Returns:
+        Dict with the sections ``classification``, ``position``,
+        ``geometry`` and ``filters``.
+    """
+    page_width = data["page_width"]
+    page_height = data["page_height"]
+    x1 = data["x1"]
+    x2 = data["x2"]
+    y1 = data["y1"]
+    y2 = data["y2"]
+    width = data["width"]
+    height = data["height"]
+
+    # Compare the square roots of the areas, i.e. the geometric mean of the
+    # side lengths of the page and of the image's bounding box.
+    page_extent = math.sqrt(abs(page_width * page_height))
+    image_extent = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
+    size_quotient = round(image_extent / page_extent, 4)
+
+    too_small = bool(size_quotient < CONFIG.filters.image_to_page_quotient.min)
+    too_large = bool(size_quotient > CONFIG.filters.image_to_page_quotient.max)
+
+    # Width-to-height ratio of the image; the unrounded value is used for the
+    # threshold checks, the rounded one is only reported.
+    format_quotient = width / height
+    too_tall = bool(format_quotient < CONFIG.filters.image_width_to_height_quotient.min)
+    too_wide = bool(format_quotient > CONFIG.filters.image_width_to_height_quotient.max)
+
+    classification = data["classification"]
+    top_probability = max(classification["probabilities"].values())
+    unconfident = bool(top_probability < CONFIG.filters.min_confidence)
+
+    # pageNumber is one-based, whereas page_idx is used as a zero-based index.
+    position = {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1}
+
+    image_size = {"quotient": size_quotient, "tooLarge": too_large, "tooSmall": too_small}
+    image_format = {"quotient": round(format_quotient, 4), "tooTall": too_tall, "tooWide": too_wide}
+
+    filters = {
+        "geometry": {"imageSize": image_size, "imageFormat": image_format},
+        "probability": {"unconfident": unconfident},
+        # Passed only if not a single filter has been breached.
+        "allPassed": not (too_large or too_small or too_tall or too_wide or unconfident),
+    }
+
+    return {
+        "classification": classification,
+        "position": position,
+        "geometry": {"width": width, "height": height},
+        "filters": filters,
+    }
--- a/image_prediction/locations.py
+++ b/image_prediction/locations.py
@ -7,3 +7,5 @@ CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")

 DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
 MLRUNS_DIR = path.join(DATA_DIR, "mlruns")
+
+TEST_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "test", "data")
--- a/image_prediction/pipeline.py
+++ b/image_prediction/pipeline.py
@ -4,9 +4,12 @@ from funcy import rcompose, juxt

 from image_prediction.classifier.classifier import Classifier
 from image_prediction.classifier.image_classifier import ImageClassifier
+from image_prediction.compositor.compositor import TransformerCompositor
 from image_prediction.estimator.adapter.adapter import EstimatorAdapter
 from image_prediction.extractor_classifier.extractor_classifier import ExtractorClassifier
+from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
 from image_prediction.formatter.formatters.enum import EnumFormatter
+from image_prediction.formatter.formatters.response import ResponseTransformer
 from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
 from image_prediction.label_mapper.mappers.probability import ProbabilityMapper

@ -18,27 +21,29 @@ def get_image_classifier(model_loader, model_identifier):
    return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))


-def get_extractor():
-    image_extractor = ParsablePDFImageExtractor(verbose=True)
+def get_extractor(**kwargs):
+    image_extractor = ParsablePDFImageExtractor(**kwargs)

    return image_extractor


-def get_extractor_classifier(model_loader, model_identifier):
-    extractor_classifier = ExtractorClassifier(get_extractor(), get_image_classifier(model_loader, model_identifier))
+def get_extractor_classifier(model_loader, model_identifier, **kwargs):
+    extractor_classifier = ExtractorClassifier(
+        get_extractor(**kwargs), get_image_classifier(model_loader, model_identifier)
+    )

    return extractor_classifier


 def get_formatter():
-    formatter = EnumFormatter()
+    formatter = TransformerCompositor(EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter())

    return formatter


 class Pipeline:
-    def __init__(self, model_loader, model_identifier):
-        self.pipe = rcompose(get_extractor_classifier(model_loader, model_identifier), get_formatter())
+    def __init__(self, model_loader, model_identifier, **kwargs):
+        self.pipe = rcompose(get_extractor_classifier(model_loader, model_identifier, **kwargs), get_formatter())

    def __call__(self, pdf: bytes):
        yield from self.pipe(pdf)
--- a/scripts/run_pipeline.py
+++ b/scripts/run_pipeline.py
@ -19,8 +19,11 @@ def main(args):
    with open(args.pdf, "rb") as f:
        predictions = pipeline(f.read())

+    with open("/tmp/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json", "w") as f:
+        json.dump(list(predictions), f, indent=2)
+
    for prd in predictions:
-        print(json.dumps(prd, indent=1))
+        print(json.dumps(prd, indent=2))


 if __name__ == "__main__":
--- a/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf
+++ b/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf
--- a/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json
+++ b/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json
@ -0,0 +1,42 @@
+[
+  {
+    "classification": {
+      "label": "formula",
+      "probabilities": {
+        "formula": 1.0,
+        "logo": 0.0,
+        "other": 0.0,
+        "signature": 0.0
+      }
+    },
+    "position": {
+      "x1": 321,
+      "x2": 515,
+      "y1": 300,
+      "y2": 494,
+      "pageNumber": 2
+    },
+    "geometry": {
+      "width": 389,
+      "height": 389
+    },
+    "filters": {
+      "geometry": {
+        "imageSize": {
+          "quotient": 0.2741,
+          "tooLarge": false,
+          "tooSmall": false
+        },
+        "imageFormat": {
+          "quotient": 1.0,
+          "tooTall": false,
+          "tooWide": false
+        }
+      },
+      "probability": {
+        "unconfident": false
+      },
+      "allPassed": true
+    }
+  }
+]
--- a/test/unit_tests/conftest.py
+++ b/test/unit_tests/conftest.py
@ -227,13 +227,13 @@ def map_labels(numeric_labels, classes):

@pytest.fixture
 def metadata_plus_mapped_prediction(expected_predictions_mapped, metadata):
-    return [{"prediction": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped, metadata)]
+    return [{"classification": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped, metadata)]


@pytest.fixture
 def metadata_formatted_plus_mapped_prediction_formatted(expected_predictions_mapped_and_formatted, metadata_formatted):
    return [
-        {"prediction": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped_and_formatted, metadata_formatted)
+        {"classification": epm,  **mdt} for epm, mdt in zip(expected_predictions_mapped_and_formatted, metadata_formatted)
    ]


--- a/test/unit_tests/extractor_classifier_test.py
+++ b/test/unit_tests/extractor_classifier_test.py
@ -10,5 +10,5 @@ from image_prediction.extractor_classifier.extractor_classifier import Extractor
 def test_extractor_classifier(image_extractor, image_classifier, images, batch_of_expected_string_labels):
    extractor_classifier = ExtractorClassifier(image_extractor, image_classifier)
    results = extractor_classifier(images)
-    labels = list(map(itemgetter("prediction"), results))
+    labels = list(map(itemgetter("classification"), results))
    assert labels == batch_of_expected_string_labels
--- a/test/unit_tests/pipeline_test.py
+++ b/test/unit_tests/pipeline_test.py
@ -0,0 +1,18 @@
+import json
+import os
+
+from image_prediction.default_objects import load_pipeline
+from image_prediction.locations import TEST_DATA_DIR
+
+
+def test_pipeline():
+    """Run the default pipeline on the sample PDF and compare with the stored predictions."""
+    # NOTE(review): both fixtures are read from TEST_DATA_DIR - confirm that the
+    # PDF is actually stored there and not in a sibling directory such as
+    # test/test_data.
+    pdf_path = os.path.join(TEST_DATA_DIR, "f2dc689ca794fccb8cd38b95f2bf6ba9.pdf")
+    expectations_path = os.path.join(TEST_DATA_DIR, "f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json")
+
+    pipeline = load_pipeline(verbose=False)
+
+    with open(pdf_path, "rb") as pdf_file:
+        # Materialise the results so they can be compared with the JSON list.
+        actual = list(pipeline(pdf_file.read()))
+
+    with open(expectations_path, "r") as expectations_file:
+        expected = json.load(expectations_file)
+
+    assert actual == expected