Pull request #42: Bugfix/RED-5277 heartbeat

Merge in RR/image-prediction from bugfix/RED-5277-heartbeat to master

* commit '846f127d3ba75c1be124ddc780a4f9c849dc84af':
  update reference
  fix type
  remove commented out code
  import logger from `__init__.py`
  add log config to `__init__.py`
  remove extra stream handler
  update reference
  update reference
  update reference
  update reference
  update reference
  build dev image and push to nexus
  add logging & only return one object from `process_request()`
  cache loaded pipeline & disable tqdm output by default
  format + set verbose to False by default
  update
Author: Francisco Schulz
Date: 2023-02-16 09:54:07 +01:00
Commit: 2001e9d7f3

7 changed files with 50 additions and 31 deletions

.gitignore

@@ -1,7 +1,8 @@
 .vscode/
 *.h5
 /venv/
+*venv*
 .idea/
 src/data
 !.gitignore
 *.project
@@ -172,4 +173,4 @@ fabric.properties
 # https://plugins.jetbrains.com/plugin/12206-codestream
 .idea/codestream.xml
 
-# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
+# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm


@@ -22,7 +22,8 @@ then
 else
   newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
   echo "gitTag=${newVersion}" > git.tag
-  echo "dev build with tag ${newVersion}"
+  dev_tag="dev"
+  echo "dev build with tag $dev_tag"
   python3 -m venv build_venv
   source build_venv/bin/activate
   python3 -m pip install --upgrade pip
@@ -34,7 +35,8 @@ else
   echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
   echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
   docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
-  docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} .
+  docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag .
+  docker push nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag
   exit 0
 fi
@@ -53,4 +55,4 @@ echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iq
 docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
 docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} .
 echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
-docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}
+docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}


@@ -1,26 +1,24 @@
 webserver:
-  host: $SERVER_HOST|"127.0.0.1" # Webserver address
-  port: $SERVER_PORT|5000 # Webserver port
+  host: $SERVER_HOST|"127.0.0.1" # webserver address
+  port: $SERVER_PORT|5000 # webserver port
 service:
-  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
-  verbose: $VERBOSE|True # Service prints document processing progress to stdout
-  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
-  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
+  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
+  verbose: $VERBOSE|False # Service does not print document processing progress to stdout by default
+  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
+  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
 
 # These variables control filters that are applied to either images, image metadata or service_estimator predictions.
 # The filter result values are reported in the service responses. For convenience, the response to a request contains a
 # "filters.allPassed" field, which is set to false if any of the values returned by the filters does not meet its
 # specified required value.
 filters:
   image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
-    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
-    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
+    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
+    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
-  image_width_to_height_quotient: # Image width to height ratio
-    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
-    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
+  image_width_to_height_quotient: # Image width to height ratio
+    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
+    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
-  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
+  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
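A note on the filter semantics above: the quotients are plain ratios, and "filters.allPassed" is just the conjunction of the individual range checks. A minimal Python sketch of the arithmetic, with hypothetical helper names that are not part of this PR:

    import math

    def image_to_page_quotient(image_w, image_h, page_w, page_h):
        # Ratio of the geometric means of the image area and the page area,
        # as described by the config comment above.
        return math.sqrt(image_w * image_h) / math.sqrt(page_w * page_h)

    def all_passed(values, limits):
        # "filters.allPassed" is false as soon as any reported filter value
        # falls outside its configured [min, max] range.
        return all(
            limits[name]["min"] <= value <= limits[name]["max"]
            for name, value in values.items()
        )

    # A 300x400 px image on a 1200x1600 px page gives a quotient of 0.25,
    # inside the default [0.05, 0.75] range above.
    print(image_to_page_quotient(300, 400, 1200, 1600))  # 0.25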


@@ -0,0 +1,13 @@
+import logging
+import sys
+
+# log config
+LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s"
+DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+stream_handler = logging.StreamHandler(sys.stdout)
+stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
+stream_handler.setFormatter(stream_handler_format)
+logger = logging.getLogger(__name__)
+logger.propagate = False
+logger.addHandler(stream_handler)
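With this package-level logger in place, consumers only import it instead of configuring their own handlers (main.py below does exactly that). A minimal usage sketch; the printed line is illustrative output in the format configured above, not a captured log:

    from image_prediction import logger

    logger.setLevel("INFO")  # stays at the default WARNING level until a caller sets it
    logger.info("pipeline loaded")
    # 2023-02-16 09:54:07 [INFO] - [example.py -> <module>() -> 4] : pipeline loaded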


@@ -1,7 +1,6 @@
 import os
-from functools import partial
+from functools import lru_cache, partial
 from itertools import chain, tee
 from typing import Iterable
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
 from tqdm import tqdm
@@ -20,6 +19,7 @@ from image_prediction.utils.generic import lift, starlift
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
+@lru_cache(maxsize=None)
 def load_pipeline(**kwargs):
     model_loader = get_mlflow_model_loader(MLRUNS_DIR)
     model_identifier = CONFIG.service.mlflow_run_id
@@ -38,7 +38,7 @@ def star(f):
 class Pipeline:
-    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
+    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
        self.verbose = verbose
        extract = get_extractor(**kwargs)
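The @lru_cache(maxsize=None) decorator is what "cache loaded pipeline" refers to: the first call to load_pipeline() with a given set of keyword arguments builds the pipeline, and every later call with the same arguments returns the cached object instead of reloading the model. A self-contained sketch of the mechanism (standalone example, not this repo's code):

    from functools import lru_cache

    @lru_cache(maxsize=None)
    def load_model(run_id, batch_size=16):
        print(f"loading model {run_id} ...")  # runs only on a cache miss
        return object()  # stands in for the expensive mlflow model load

    first = load_model("run-a", batch_size=16)   # prints, performs the load
    second = load_model("run-a", batch_size=16)  # cache hit: same object, no reload
    assert first is second

Note that the cache keys on the call arguments, so anything forwarded to load_pipeline() through **kwargs must be hashable.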

@@ -1 +1 @@
-Subproject commit 64d6a8cec62eeddf26bd71a9aabc28b40dcec901
+Subproject commit c97ae3d2c242dfc88a342955311dd488cb9a5f60


@@ -2,6 +2,7 @@ import gzip
 import json
 import logging
 
+from image_prediction import logger
 from image_prediction.config import Config
 from image_prediction.locations import CONFIG_FILE
 from image_prediction.pipeline import load_pipeline
@@ -14,8 +15,6 @@ from pyinfra.storage.storage import get_storage
 PYINFRA_CONFIG = config.get_config()
 IMAGE_CONFIG = Config(CONFIG_FILE)
 
-logging.getLogger().addHandler(logging.StreamHandler())
-logger = logging.getLogger("main")
 logger.setLevel(PYINFRA_CONFIG.logging_level_root)
@@ -28,28 +27,34 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root)
 def process_request(request_message):
     dossier_id = request_message["dossierId"]
     file_id = request_message["fileId"]
-    logger.info(f"Processing {dossier_id=} {file_id=} ...")
     target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}"
     response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}"
+    logger.info("Processing file %s w/ file_id=%s and dossier_id=%s", target_file_name, file_id, dossier_id)
 
     bucket = PYINFRA_CONFIG.storage_bucket
     storage = get_storage(PYINFRA_CONFIG)
+    logger.debug("loading model pipeline")
     pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size)
 
-    if not storage.exists(bucket, target_file_name):
-        publish_result = False
-    else:
-        publish_result = True
+    if storage.exists(bucket, target_file_name):
+        logger.info("fetching file for file_id=%s and dossier_id=%s", file_id, dossier_id)
         object_bytes = storage.get_object(bucket, target_file_name)
         object_bytes = gzip.decompress(object_bytes)
         classifications = list(pipeline(pdf=object_bytes))
+        logger.info("predictions ready for file_id=%s and dossier_id=%s", file_id, dossier_id)
         result = {**request_message, "data": classifications}
         storage_bytes = gzip.compress(json.dumps(result).encode("utf-8"))
+        logger.info("storing predictions for file_id=%s and dossier_id=%s", file_id, dossier_id)
         storage.put_object(bucket, response_file_name, storage_bytes)
-    return publish_result, {"dossierId": dossier_id, "fileId": file_id}
+        return {"dossierId": dossier_id, "fileId": file_id}
+    else:
+        logger.info("no files found for file_id=%s and dossier_id=%s", file_id, dossier_id)
+        return None
 
 def main():
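With process_request() now returning a single object, the old (publish_result, payload) tuple is gone: the payload itself signals whether there is anything to publish. A hypothetical caller-side sketch of how main() could use that contract (names are illustrative, not from this diff):

    def handle_message(request_message, publisher):
        result = process_request(request_message)
        if result is not None:
            publisher.publish(result)  # file existed, predictions were stored
        else:
            logger.info("nothing to publish for this message")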