Add monitoring with Grafana and Prometheus

2022-03-15 13:14:48 +01:00 · 2022-03-15 13:14:48 +01:00 · 597043bb38
commit 597043bb38
parent 2117e2a294
3 changed files with 20 additions and 0 deletions
--- a/config.yaml
+++ b/config.yaml
@@ -8,6 +8,7 @@ service:
  batch_size: $BATCH_SIZE|2  # Number of images in memory simultaneously
  verbose: $VERBOSE|True  # Service prints document processing progress to stdout
  run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7  # The ID of the mlflow run to load the model from
+  monitoring_enabled: $MONITORING_ENABLED|True  # Whether the service records monitoring metrics (processed-file count, memory usage)


 # These variables control filters that are applied to either images, image metadata or model predictions. The filter
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,5 @@ PDFNetPython3~=9.1.0
 Pillow~=8.3.2
 PyYAML~=5.4.1
 scikit_learn~=0.24.2
+prometheus-client==0.13.1
+prometheus_flask_exporter==0.19.0
--- a/src/serve.py
+++ b/src/serve.py
@@ -1,5 +1,6 @@
 import logging
 import tempfile
+import tracemalloc

 from flask import Flask, request, jsonify
 from waitress import serve
@@ -7,14 +8,21 @@ from waitress import serve
 from image_prediction.config import CONFIG
 from image_prediction.predictor import Predictor, extract_image_metadata_pairs, classify_images
 from image_prediction.response import build_response
+from prometheus_client import Gauge, Counter
+from prometheus_flask_exporter import PrometheusMetrics


 def main():

    predictor = Predictor()
    logging.info("Predictor ready.")
+    tracemalloc.start()

    app = Flask(__name__)
+    metrics = PrometheusMetrics(app=app, path='/prometheus')
+
+    file_counter = Counter("image_prediction_file_counter", "count processed files")
+    ram_metric = Gauge("image_prediction_memory_usage", "Memory usage in Mb")

    @app.route("/ready", methods=["GET"])
    def ready():
@@ -29,7 +37,13 @@ def main():
        return resp

    @app.route("/", methods=["POST"])
+    @metrics.summary('image_prediction_request_time_seconds', 'Time spent processing request')
    def predict():
+        def do_monitoring():
+            file_counter.inc()
+            _, peak = tracemalloc.get_traced_memory()
+            ram_metric.set(peak / 10 ** 6)
+
        pdf = request.data

        logging.debug("Running predictor on document...")
@@ -38,6 +52,8 @@ def main():
            image_metadata_pairs = extract_image_metadata_pairs(tmp_file.name)
            try:
                predictions, metadata = classify_images(predictor, image_metadata_pairs)
+                if CONFIG.service.monitoring_enabled:
+                    do_monitoring()
            except Exception as err:
                logging.warning("Analysis failed.")
                logging.exception(err)
@@ -59,6 +75,7 @@ def run_prediction_server(app, mode="development"):
        app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True)
    elif mode == "production":
        serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
+    tracemalloc.stop()


 if __name__ == "__main__":