From 54b7ba24e86ac76c4790718c120ac3d4b558b168 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 16:25:49 +0100 Subject: [PATCH 01/16] update --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9e95b52..6343b16 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ .vscode/ *.h5 -/venv/ +*venv* .idea/ +src/data !.gitignore *.project @@ -172,4 +173,4 @@ fabric.properties # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml -# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm +# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm \ No newline at end of file From b5dc5aa777e9a44ea5d6da58460a9d05bcf5981f Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 16:26:24 +0100 Subject: [PATCH 02/16] format + set verbose to False by default --- config.yaml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/config.yaml b/config.yaml index 9bfcaf1..b0e5aa2 100644 --- a/config.yaml +++ b/config.yaml @@ -1,26 +1,24 @@ webserver: - host: $SERVER_HOST|"127.0.0.1" # Webserver address - port: $SERVER_PORT|5000 # Webserver port + host: $SERVER_HOST|"127.0.0.1" # webserver address + port: $SERVER_PORT|5000 # webserver port service: - logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger - verbose: $VERBOSE|True # Service prints document processing progress to stdout - batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously - mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from - + logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger + verbose: $VERBOSE|False # Service DOES NOT print document processing progress to stdout + batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously + mlflow_run_id: 
$MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from # These variables control filters that are applied to either images, image metadata or service_estimator predictions. # The filter result values are reported in the service responses. For convenience the response to a request contains a # "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its # specified required value. filters: - image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas) - min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible - max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible + min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible + max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible - image_width_to_height_quotient: # Image width to height ratio - min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible - max: $MAX_IMAGE_FORMAT|10 # Maximum permissible + image_width_to_height_quotient: # Image width to height ratio + min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible + max: $MAX_IMAGE_FORMAT|10 # Maximum permissible - min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence + min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence From d239368d703b9fc64f9e8f3f34b6fc9e127b1393 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 16:27:21 +0100 Subject: [PATCH 03/16] cache loaded pipeline & disable tqdm output by default --- image_prediction/pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/image_prediction/pipeline.py b/image_prediction/pipeline.py index 704a88f..2bff17a 100644 --- a/image_prediction/pipeline.py +++ b/image_prediction/pipeline.py @@ -1,7 +1,6 @@ import os -from functools import partial +from functools import lru_cache, partial from itertools import chain, tee -from typing import Iterable from funcy import rcompose, first, compose, second, chunks, 
identity, rpartial from tqdm import tqdm @@ -20,6 +19,7 @@ from image_prediction.utils.generic import lift, starlift os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" +@lru_cache(maxsize=None) def load_pipeline(**kwargs): model_loader = get_mlflow_model_loader(MLRUNS_DIR) model_identifier = CONFIG.service.mlflow_run_id @@ -38,7 +38,7 @@ def star(f): class Pipeline: - def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs): + def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs): self.verbose = verbose extract = get_extractor(**kwargs) From 9065ec1d1275105cb5a0cd589c5653bb985dd3d7 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 16:29:04 +0100 Subject: [PATCH 04/16] add logging & only return one object from `process_request()` --- src/serve.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/serve.py b/src/serve.py index a865c7d..d47fe5b 100644 --- a/src/serve.py +++ b/src/serve.py @@ -28,28 +28,34 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root) def process_request(request_message): dossier_id = request_message["dossierId"] file_id = request_message["fileId"] - logger.info(f"Processing {dossier_id=} {file_id=} ...") target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}" response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}" + logger.info("Processing file %s w/ file_id=%s, and daossier_id=%s", target_file_name, file_id, dossier_id) bucket = PYINFRA_CONFIG.storage_bucket storage = get_storage(PYINFRA_CONFIG) + logger.debug("loading model pipeline") pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size) - if not storage.exists(bucket, target_file_name): - publish_result = False - else: - publish_result = True + if storage.exists(bucket, target_file_name): + logger.info("fetching file for file_id=%s, and daossier_id=%s", 
file_id, dossier_id) object_bytes = storage.get_object(bucket, target_file_name) object_bytes = gzip.decompress(object_bytes) + classifications = list(pipeline(pdf=object_bytes)) + logger.info("predictions ready for file_id=%s, and daossier_id=%s", file_id, dossier_id) result = {**request_message, "data": classifications} storage_bytes = gzip.compress(json.dumps(result).encode("utf-8")) + + logger.info("storing predictions for file_id=%s, and daossier_id=%s", file_id, dossier_id) storage.put_object(bucket, response_file_name, storage_bytes) - return publish_result, {"dossierId": dossier_id, "fileId": file_id} + return {"dossierId": dossier_id, "fileId": file_id} + else: + logger.info("no files found for file_id=%s, and daossier_id=%s", file_id, dossier_id) + return None def main(): From 4bcadcd26690a86ef19e974490aa44a14cbbaeca Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 16:30:18 +0100 Subject: [PATCH 05/16] build dev image and push to nexus --- bamboo-specs/src/main/resources/scripts/docker-build.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bamboo-specs/src/main/resources/scripts/docker-build.sh b/bamboo-specs/src/main/resources/scripts/docker-build.sh index 90a11f0..5bc8302 100755 --- a/bamboo-specs/src/main/resources/scripts/docker-build.sh +++ b/bamboo-specs/src/main/resources/scripts/docker-build.sh @@ -22,7 +22,8 @@ then else newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" echo "gitTag=${newVersion}" > git.tag - echo "dev build with tag ${newVersion}" + dev_tag="dev" + echo "dev build with tag $dev_tag" python3 -m venv build_venv source build_venv/bin/activate python3 -m pip install --upgrade pip @@ -34,7 +35,8 @@ else echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001 docker build 
-f Dockerfile_base -t $SERVICE_NAME_BASE . - docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} . + docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag . + docker push nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag exit 0 fi @@ -53,4 +55,4 @@ echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iq docker build -f Dockerfile_base -t $SERVICE_NAME_BASE . docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} . echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001 -docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} +docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} \ No newline at end of file From ecc9f69d9c9f9a87ea62a9737bc0b74c0223794b Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 16:52:56 +0100 Subject: [PATCH 06/16] update reference --- incl/pyinfra | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/incl/pyinfra b/incl/pyinfra index 64d6a8c..3fd3eb2 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 64d6a8cec62eeddf26bd71a9aabc28b40dcec901 +Subproject commit 3fd3eb255c252d1e208b88b475ec8a07c521619d From 29c76e7ebf1a5041183aa02a1c2285c79dda4d00 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Tue, 14 Feb 2023 18:02:09 +0100 Subject: [PATCH 07/16] update reference --- incl/pyinfra | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/incl/pyinfra b/incl/pyinfra index 3fd3eb2..cf057da 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 3fd3eb255c252d1e208b88b475ec8a07c521619d +Subproject commit cf057daed23d5f5b0f6f3a1a31e956e015e86368 From c1449134ecbc49bf5d97e33b650d17f132987f28 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 10:23:27 +0100 Subject: [PATCH 08/16] update reference --- incl/pyinfra | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/incl/pyinfra b/incl/pyinfra index cf057da..ee11e01 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit cf057daed23d5f5b0f6f3a1a31e956e015e86368 +Subproject commit ee11e018efdbc63a740008e7fa2415cbb12476ae From 7dca05a53d9354d2eb999496eb7e66027caf4400 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 11:11:23 +0100 Subject: [PATCH 09/16] update reference --- incl/pyinfra | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/incl/pyinfra b/incl/pyinfra index ee11e01..adb35db 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit ee11e018efdbc63a740008e7fa2415cbb12476ae +Subproject commit adb35db6fa6daf4b79263a918716c34905e8b3bc From d505ac4e5049421aaecc1825e24a5b404a009028 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 15:01:29 +0100 Subject: [PATCH 10/16] update reference --- incl/pyinfra | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/incl/pyinfra b/incl/pyinfra index adb35db..9e139e7 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit adb35db6fa6daf4b79263a918716c34905e8b3bc +Subproject commit 9e139e79e46c52014986f9afb2c6534281b55c10 From 0bdf5a726a73ca5424332a8f1c901b794398ba04 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 15:25:13 +0100 Subject: [PATCH 11/16] remove extra stream handler --- src/serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/serve.py b/src/serve.py index d47fe5b..60b86ca 100644 --- a/src/serve.py +++ b/src/serve.py @@ -14,8 +14,8 @@ from pyinfra.storage.storage import get_storage PYINFRA_CONFIG = config.get_config() IMAGE_CONFIG = Config(CONFIG_FILE) -logging.getLogger().addHandler(logging.StreamHandler()) -logger = logging.getLogger("main") +# logging.getLogger().addHandler(logging.StreamHandler()) +logger = logging.getLogger(__name__) logger.setLevel(PYINFRA_CONFIG.logging_level_root) From 
c1ae8e6a4b00c7f68c0078a7936707ef3ddb2c59 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 15:44:56 +0100 Subject: [PATCH 12/16] add log config to `__init__.py` --- image_prediction/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/image_prediction/__init__.py b/image_prediction/__init__.py index e69de29..315dcb1 100644 --- a/image_prediction/__init__.py +++ b/image_prediction/__init__.py @@ -0,0 +1,13 @@ +import logging +import sys + +# log config +LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s" +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" +stream_handler = logging.StreamHandler(sys.stdout) +stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT) +stream_handler.setFormatter(stream_handler_format) + +logger = logging.getLogger(__name__) +logger.propagate = False +logger.addHandler(stream_handler) From 00b40c06328ee0410e3db191cdbbb271a474034b Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 15:45:20 +0100 Subject: [PATCH 13/16] import logger from `__init__.py` --- src/serve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/serve.py b/src/serve.py index 60b86ca..c666527 100644 --- a/src/serve.py +++ b/src/serve.py @@ -2,6 +2,7 @@ import gzip import json import logging +from image_prediction import logger from image_prediction.config import Config from image_prediction.locations import CONFIG_FILE from image_prediction.pipeline import load_pipeline @@ -15,7 +16,7 @@ PYINFRA_CONFIG = config.get_config() IMAGE_CONFIG = Config(CONFIG_FILE) # logging.getLogger().addHandler(logging.StreamHandler()) -logger = logging.getLogger(__name__) +# logger = logging.getLogger(__name__) logger.setLevel(PYINFRA_CONFIG.logging_level_root) From ee99d76aabf739da2cf62845d121694c80c02d65 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 15:51:33 +0100 Subject: [PATCH 14/16] remove commented out code --- 
src/serve.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/serve.py b/src/serve.py index c666527..ea12d44 100644 --- a/src/serve.py +++ b/src/serve.py @@ -15,8 +15,6 @@ from pyinfra.storage.storage import get_storage PYINFRA_CONFIG = config.get_config() IMAGE_CONFIG = Config(CONFIG_FILE) -# logging.getLogger().addHandler(logging.StreamHandler()) -# logger = logging.getLogger(__name__) logger.setLevel(PYINFRA_CONFIG.logging_level_root) From d4657f1ab164971d11f28da5f30f9bb5d321ad00 Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Wed, 15 Feb 2023 16:46:47 +0100 Subject: [PATCH 15/16] fix typo --- src/serve.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/serve.py b/src/serve.py index ea12d44..e123213 100644 --- a/src/serve.py +++ b/src/serve.py @@ -29,7 +29,7 @@ def process_request(request_message): file_id = request_message["fileId"] target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}" response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}" - logger.info("Processing file %s w/ file_id=%s, and daossier_id=%s", target_file_name, file_id, dossier_id) + logger.info("Processing file %s w/ file_id=%s and dossier_id=%s", target_file_name, file_id, dossier_id) bucket = PYINFRA_CONFIG.storage_bucket storage = get_storage(PYINFRA_CONFIG) @@ -38,22 +38,22 @@ def process_request(request_message): pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size) if storage.exists(bucket, target_file_name): - logger.info("fetching file for file_id=%s, and daossier_id=%s", file_id, dossier_id) + logger.info("fetching file for file_id=%s and dossier_id=%s", file_id, dossier_id) object_bytes = storage.get_object(bucket, target_file_name) object_bytes = gzip.decompress(object_bytes) classifications = list(pipeline(pdf=object_bytes)) - logger.info("predictions ready for file_id=%s, and daossier_id=%s", file_id, dossier_id) + 
logger.info("predictions ready for file_id=%s and dossier_id=%s", file_id, dossier_id) result = {**request_message, "data": classifications} storage_bytes = gzip.compress(json.dumps(result).encode("utf-8")) - logger.info("storing predictions for file_id=%s, and daossier_id=%s", file_id, dossier_id) + logger.info("storing predictions for file_id=%s and dossier_id=%s", file_id, dossier_id) storage.put_object(bucket, response_file_name, storage_bytes) return {"dossierId": dossier_id, "fileId": file_id} else: - logger.info("no files found for file_id=%s, and daossier_id=%s", file_id, dossier_id) + logger.info("no files found for file_id=%s and dossier_id=%s", file_id, dossier_id) return None From 846f127d3ba75c1be124ddc780a4f9c849dc84af Mon Sep 17 00:00:00 2001 From: Francisco Schulz Date: Thu, 16 Feb 2023 09:50:17 +0100 Subject: [PATCH 16/16] update reference --- incl/pyinfra | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/incl/pyinfra b/incl/pyinfra index 9e139e7..c97ae3d 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 9e139e79e46c52014986f9afb2c6534281b55c10 +Subproject commit c97ae3d2c242dfc88a342955311dd488cb9a5f60