diff --git a/.gitignore b/.gitignore index 9e95b52..6343b16 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ .vscode/ *.h5 -/venv/ +*venv* .idea/ +src/data !.gitignore *.project @@ -172,4 +173,4 @@ fabric.properties # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml -# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm +# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm \ No newline at end of file diff --git a/bamboo-specs/src/main/resources/scripts/docker-build.sh b/bamboo-specs/src/main/resources/scripts/docker-build.sh index 90a11f0..5bc8302 100755 --- a/bamboo-specs/src/main/resources/scripts/docker-build.sh +++ b/bamboo-specs/src/main/resources/scripts/docker-build.sh @@ -22,7 +22,8 @@ then else newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" echo "gitTag=${newVersion}" > git.tag - echo "dev build with tag ${newVersion}" + dev_tag="dev" + echo "dev build with tag $dev_tag" python3 -m venv build_venv source build_venv/bin/activate python3 -m pip install --upgrade pip @@ -34,7 +35,8 @@ else echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001 docker build -f Dockerfile_base -t $SERVICE_NAME_BASE . - docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} . + docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag . + docker push nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag exit 0 fi @@ -53,4 +55,4 @@ echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iq docker build -f Dockerfile_base -t $SERVICE_NAME_BASE . docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} . 
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001 -docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} +docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} \ No newline at end of file diff --git a/config.yaml b/config.yaml index 9bfcaf1..b0e5aa2 100644 --- a/config.yaml +++ b/config.yaml @@ -1,26 +1,24 @@ webserver: - host: $SERVER_HOST|"127.0.0.1" # Webserver address - port: $SERVER_PORT|5000 # Webserver port + host: $SERVER_HOST|"127.0.0.1" # webserver address + port: $SERVER_PORT|5000 # webserver port service: - logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger - verbose: $VERBOSE|True # Service prints document processing progress to stdout - batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously - mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from - + logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger + verbose: $VERBOSE|False # Service does not print document processing progress to stdout + batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously + mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from # These variables control filters that are applied to either images, image metadata or service_estimator predictions. # The filter result values are reported in the service responses. For convenience the response to a request contains a # "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its # specified required value. 
filters: - image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas) - min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible - max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible + min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible + max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible - image_width_to_height_quotient: # Image width to height ratio - min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible - max: $MAX_IMAGE_FORMAT|10 # Maximum permissible + image_width_to_height_quotient: # Image width to height ratio + min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible + max: $MAX_IMAGE_FORMAT|10 # Maximum permissible - min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence + min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence diff --git a/image_prediction/__init__.py b/image_prediction/__init__.py index e69de29..315dcb1 100644 --- a/image_prediction/__init__.py +++ b/image_prediction/__init__.py @@ -0,0 +1,13 @@ +import logging +import sys + +# log config +LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s" +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" +stream_handler = logging.StreamHandler(sys.stdout) +stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT) +stream_handler.setFormatter(stream_handler_format) + +logger = logging.getLogger(__name__) +logger.propagate = False +logger.addHandler(stream_handler) diff --git a/image_prediction/pipeline.py b/image_prediction/pipeline.py index 704a88f..2bff17a 100644 --- a/image_prediction/pipeline.py +++ b/image_prediction/pipeline.py @@ -1,7 +1,6 @@ import os -from functools import partial +from functools import lru_cache, partial from itertools import chain, tee -from typing import Iterable from funcy import rcompose, first, compose, second, chunks, identity, rpartial from tqdm import tqdm @@ -20,6 +19,7 @@ from image_prediction.utils.generic import lift, starlift 
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" +@lru_cache(maxsize=None) def load_pipeline(**kwargs): model_loader = get_mlflow_model_loader(MLRUNS_DIR) model_identifier = CONFIG.service.mlflow_run_id @@ -38,7 +38,7 @@ def star(f): class Pipeline: - def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs): + def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs): self.verbose = verbose extract = get_extractor(**kwargs) diff --git a/incl/pyinfra b/incl/pyinfra index 64d6a8c..c97ae3d 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 64d6a8cec62eeddf26bd71a9aabc28b40dcec901 +Subproject commit c97ae3d2c242dfc88a342955311dd488cb9a5f60 diff --git a/src/serve.py b/src/serve.py index a865c7d..e123213 100644 --- a/src/serve.py +++ b/src/serve.py @@ -2,6 +2,7 @@ import gzip import json import logging +from image_prediction import logger from image_prediction.config import Config from image_prediction.locations import CONFIG_FILE from image_prediction.pipeline import load_pipeline @@ -14,8 +15,6 @@ from pyinfra.storage.storage import get_storage PYINFRA_CONFIG = config.get_config() IMAGE_CONFIG = Config(CONFIG_FILE) -logging.getLogger().addHandler(logging.StreamHandler()) -logger = logging.getLogger("main") logger.setLevel(PYINFRA_CONFIG.logging_level_root) @@ -28,28 +27,34 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root) def process_request(request_message): dossier_id = request_message["dossierId"] file_id = request_message["fileId"] - logger.info(f"Processing {dossier_id=} {file_id=} ...") target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}" response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}" + logger.info("Processing file %s w/ file_id=%s and dossier_id=%s", target_file_name, file_id, dossier_id) bucket = PYINFRA_CONFIG.storage_bucket storage = get_storage(PYINFRA_CONFIG) + logger.debug("loading model 
pipeline") pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size) - if not storage.exists(bucket, target_file_name): - publish_result = False - else: - publish_result = True + if storage.exists(bucket, target_file_name): + logger.info("fetching file for file_id=%s and dossier_id=%s", file_id, dossier_id) object_bytes = storage.get_object(bucket, target_file_name) object_bytes = gzip.decompress(object_bytes) + classifications = list(pipeline(pdf=object_bytes)) + logger.info("predictions ready for file_id=%s and dossier_id=%s", file_id, dossier_id) result = {**request_message, "data": classifications} storage_bytes = gzip.compress(json.dumps(result).encode("utf-8")) + + logger.info("storing predictions for file_id=%s and dossier_id=%s", file_id, dossier_id) storage.put_object(bucket, response_file_name, storage_bytes) - return publish_result, {"dossierId": dossier_id, "fileId": file_id} + return {"dossierId": dossier_id, "fileId": file_id} + else: + logger.info("no files found for file_id=%s and dossier_id=%s", file_id, dossier_id) + return None def main():