reimplemented model loader logic and moved base weights into mlflow run dir

This commit is contained in:
Matthias Bisping 2022-03-29 19:50:43 +02:00
parent edbc5c3f84
commit 06adedac57
9 changed files with 153 additions and 183 deletions

View File

@ -1,5 +1,6 @@
[core]
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/image_service/
url = ssh://vector.iqser.com/research/image-prediction/
port = 22

2
.gitignore vendored
View File

@ -172,4 +172,4 @@ fabric.properties
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
/image_prediction/data/mlruns/
/data/mlruns/
#/data/mlruns/

1
data/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/mlruns

View File

@ -1,4 +0,0 @@
outs:
- md5: 6d0186c1f25e889d531788f168fa6cf0
size: 16727296
path: base_weights.h5

View File

@ -1,5 +1,5 @@
outs:
- md5: d1c708270bab6fcd344d4a8b05d1103d.dir
size: 150225383
nfiles: 178
- md5: 4219c52caf5f87f5a94f1ae00c60fb91.dir
size: 166952679
nfiles: 179
path: mlruns

View File

@ -1,123 +1,110 @@
# """This module translates between the new ModelLoader API and the inconsistent and historically grown redai model and
# MLflow API as well as the circumstance, that the model artifacts are currently not stored at a single place, due to the
# need of loading the base weights of the pre-trained model, that became apparent at a later point than the design of the
# MLflow storage and MlflowModelReader class; that is why the code in this module is so unclean. In the future, a
# non-adhoc solution should be used that offers a clean API and storage solution. Either implement a well-designed MLflow
# based solution or look into an alternative such as WandB or use a platform solution such as AWS.
# """
# import importlib
# import json
# import os
# import warnings
# from typing import Mapping
#
# import numpy as np
# from funcy import rcompose
#
# from image_prediction.exceptions import IncorrectInstantiation
# from image_prediction.model_loader.loader import ModelLoader
#
# warnings.filterwarnings("ignore", category=DeprecationWarning, module="pkg_resources")
#
# import mlflow
#
#
# def load_object(object_path):
# path_fragments = object_path.split(".")
#
# module_path = ".".join(path_fragments[:-1])
# object_name = path_fragments[-1]
#
# module = importlib.import_module(module_path)
# return getattr(module, object_name)
#
#
# def to_local_path(uri):
# return uri[7:]
#
#
# class MlflowModelReader:
#
# def __init__(self, run_id, mlruns_dir=None):
# mlflow.set_tracking_uri(mlruns_dir)
#
# self.run_id = run_id
# self.run = mlflow.get_run(run_id)
# self.artifact_uri = self.__correct_artifact_uri(self.run.info.to_proto().artifact_uri, mlruns_dir)
#
# @staticmethod
# def __correct_artifact_uri(run_artifact_uri, base_path):
# _, suffix = run_artifact_uri.split("mlruns/")
# return os.path.join(base_path, suffix)
#
# def get_weights_path(self, prefix="tt"):
# path = os.path.join(self.artifact_uri, prefix, "train_dev", "estimator", "weights.h5")
# return path
#
# def get_classes(self, prefix="tt"):
# classes = json.loads(
# self.run.data.params[os.path.join(prefix, "train_dev/estimator/classes")].replace("'", '"')
# )
# return classes
#
# def get_model_handle(self, base_weights=None):
# weights_path = self.get_weights_path()
# model_handle_builder = load_object(self.run.data.params["model_handle_builder"].strip())
# model_handle = model_handle_builder(self.get_classes(), base_weights=base_weights)
# model_handle.load_top_weights(weights_path)
# return model_handle
#
#
# class PredictionModelHandle:
# """Simplifies usage of ModelHandle instances for prediction purposes."""
#
# def __init__(self, model_handle, classes_readable: Mapping[int, str]):
# self.__model_handle = model_handle
# self.__classes_readable = classes_readable
#
# @property
# def classes(self):
# return self.__classes_readable
#
# def predict(self, *args, **kwargs):
# predict = rcompose(self.__model_handle.prep_images, self.__model_handle.model.predict)
# return predict(*args, **kwargs)
#
# def predict_proba(self, *args, **kwargs):
# predict = rcompose(self.__model_handle.prep_images, self.__model_handle.model.predict_proba)
# return predict(*args, **kwargs)
#
#
# class MlflowLoader(ModelLoader):
#
# def __init__(self, mlruns_dir):
# self.__mlruns_dir = mlruns_dir
# self._base_weights = None
#
# def load_model(self, run_id, base_weights=None) -> PredictionModelHandle:
#
# # TODO: refac https://stackoverflow.com/questions/42735421/how-to-restrict-object-instantiation-only-via-a-factory-in-python
# if not base_weights:
#
# if not self._base_weights:
# raise IncorrectInstantiation("MlflowReader needs to be initialized via get_model_loader.")
#
# base_weights = self._base_weights
#
# mlflow_reader = MlflowModelReader(run_id, mlruns_dir=self.__mlruns_dir)
# model_handel = mlflow_reader.get_model_handle(base_weights)
# model_handle = model_handel
# classes_readable = self.__load_classes(model_handle)
#
# model = PredictionModelHandle(model_handle, classes_readable)
#
# return model
#
# @staticmethod
# def __load_classes(model_handle):
#
# classes = model_handle.model.classes_
# classes_readable = np.array(model_handle.classes)
# classes_readable_aligned = classes_readable[classes[list(range(len(classes)))]]
#
# return classes_readable_aligned
"""This module translates between the new ModelLoader API and the inconsistent and historically grown redai model and
MLflow API as well as the circumstance, that the model artifacts are currently not stored at a single place, due to the
need of loading the base weights of the pre-trained model, that became apparent at a later point than the design of the
MLflow storage and MlflowModelReader class; that is why the code in this module is so unclean. In the future, a
non-adhoc solution should be used that offers a clean API and storage solution. Either implement a well-designed MLflow
based solution or look into an alternative such as WandB or use a platform solution such as AWS.
"""
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import importlib
import json
import os
from functools import lru_cache
from funcy import rcompose
from image_prediction.model_loader.database.connector import DatabaseConnector
import mlflow
class PredictionModelHandle:
"""Simplifies usage of ModelHandle instances for prediction purposes."""
def __init__(self, model_handle):
self.__model_handle = model_handle
def predict(self, *args, **kwargs):
predict = rcompose(self.__model_handle.prep_images, self.__model_handle.model.predict)
return predict(*args, **kwargs)
def predict_proba(self, *args, **kwargs):
predict = rcompose(self.__model_handle.prep_images, self.__model_handle.model.predict_proba)
return predict(*args, **kwargs)
class MlflowModelReader:
def __init__(self, mlruns_dir=None):
self.mlruns_dir = mlruns_dir
mlflow.set_tracking_uri(self.mlruns_dir)
@staticmethod
def __correct_artifact_uri(run_artifact_uri, base_path):
_, suffix = run_artifact_uri.split("mlruns/")
return os.path.join(base_path, suffix)
def __get_weights_path(self, run_id, prefix="tt"):
run = self.__get_run(run_id)
artifact_uri = self.__correct_artifact_uri(run.info.to_proto().artifact_uri, self.mlruns_dir)
path = os.path.join(artifact_uri, prefix, "train_dev", "estimator")
base_path = os.path.join(path, "base_weights.h5")
weights_path = os.path.join(path, "weights.h5")
return base_path, weights_path
@lru_cache(maxsize=None)
def __get_run(self, run_id):
return mlflow.get_run(run_id)
def __get_classes(self, run_id, prefix="tt"):
run = self.__get_run(run_id)
classes = json.loads(run.data.params[os.path.join(prefix, "train_dev/estimator/classes")].replace("'", '"'))
return classes
def __get_model_handle(self, run_id):
run = self.__get_run(run_id)
model_handle_builder = load_object(run.data.params["model_handle_builder"].strip())
base_weights_path, weights_path = self.__get_weights_path(run_id)
model_handle = model_handle_builder(self.__get_classes(run_id), base_weights=base_weights_path)
model_handle.load_top_weights(weights_path)
return model_handle
def __get_model(self, run_id) -> PredictionModelHandle:
model_handle = self.__get_model_handle(run_id)
model = PredictionModelHandle(model_handle)
return model
def __getitem__(self, run_id):
return {"model": self.__get_model(run_id), "classes": self.__get_classes(run_id)}
def load_object(object_path):
path_fragments = object_path.split(".")
module_path = ".".join(path_fragments[:-1])
object_name = path_fragments[-1]
module = importlib.import_module(module_path)
return getattr(module, object_name)
class MlflowConnector(DatabaseConnector):
def __init__(self, mlflow_reader: MlflowModelReader):
self.mlflow_reader = mlflow_reader
def get_object(self, run_id):
return self.mlflow_reader[run_id]

View File

@ -1,2 +1,4 @@
[pytest]
norecursedirs = incl
norecursedirs = incl
filterwarnings =
ignore:.*imp.*:DeprecationWarning

View File

@ -20,6 +20,7 @@ from image_prediction.image_extractor.extractors.mock import ImageExtractorMock
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.model_loader.database.connectors.mock import DatabaseConnectorMock
from image_prediction.model_loader.loader import ModelLoader
from image_prediction.model_loader.loaders.mlflow import MlflowConnector, MlflowModelReader
@pytest.fixture
@ -65,10 +66,6 @@ def estimator_adapter(estimator_type, keras_model, output_batch_generator, monke
@pytest.fixture
def keras_model(input_size):
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@ -173,7 +170,6 @@ def image_metadata_pairs(images, metadata):
@pytest.fixture
def pdf(image_metadata_pairs):
pdf = fpdf.FPDF(unit="pt")
for pair in image_metadata_pairs:
@ -183,7 +179,6 @@ def pdf(image_metadata_pairs):
def add_image(pdf, image_metadata_pair):
while fewer_pages_then_required(image_metadata_pair.metadata["page_idx"], pdf):
pdf.add_page()
@ -207,29 +202,10 @@ def add_image_to_last_page(pdf: fpdf.fpdf.FPDF, image_metadata_pair):
pdf.image(temp_image.name, x=x, y=y, w=w, h=h, type="png")
# @pytest.fixture
# def model_handle_mock(classes, classifier):
#
# class ModelHandleMock:
#
# def __init__(self, classes):
# classifier.classes_ = np.array(list(range(len(classes))))
# self.classes = classes
# self.model = classifier
#
# return ModelHandleMock(classes)
#
#
# @pytest.fixture
# def prediction_model_handle_mock(model_handle_mock, classes):
# return PredictionModelHandle(model_handle_mock, classes)
@pytest.fixture
def model():
class Model:
@staticmethod
def predict(*args):
return True
@ -239,7 +215,7 @@ def model():
return True
return Model()
@pytest.fixture
def model_database_record_identifier():
@ -257,9 +233,13 @@ def model_database(model_database_record, model_database_record_identifier):
@pytest.fixture
def database_connector(database_type, model_database):
def database_connector(database_type, model_database, mlflow_reader):
if database_type == "mock":
return DatabaseConnectorMock(model_database)
elif database_type == "mlflow":
return MlflowConnector(mlflow_reader)
else:
raise UnknownDatabaseType(f"No connector for database type {database_type} was specified.")
@ -269,17 +249,18 @@ def model_loader(database_connector):
return ModelLoader(database_connector)
# @pytest.fixture
# def model_loader(loader_type, monkeypatch, model_handle_mock, classes):
# if loader_type == "mock":
# loader = ModelLoaderMock()
# monkeypatch.setattr(loader, "model", model_handle_mock)
#
# # elif loader_type == "mlflow":
# # loader = get_mlflow_loader()
# # monkeypatch.setattr(loader, "_model_handle", model_handle_mock)
#
# else:
# raise UnknownModelLoader(f"No model loader for type {loader_type} was specified.")
#
# return loader
@pytest.fixture
def mlflow_run_id():
from image_prediction.config import CONFIG
return CONFIG.service.run_id
@pytest.fixture
def mlruns_dir():
from image_prediction.locations import MLRUNS_DIR
return MLRUNS_DIR
@pytest.fixture
def mlflow_reader(mlruns_dir):
return MlflowModelReader(mlruns_dir)

View File

@ -1,14 +1,7 @@
import numpy as np
import pytest
from image_prediction.model_loader.loaders.mlflow import PredictionModelHandle
# @pytest.mark.parametrize("loader_type", ["mock"])
# @pytest.mark.parametrize("estimator_type", ["mock"])
# @pytest.mark.parametrize("batch_size", [3])
# def test_load_model_and_classes(model_loader, model_handle_mock, classes):
# model_loaded, classes_loaded = model_loader.load_model_and_classes("an identifier")
# assert model_loaded == model_handle_mock
# assert np.all(classes_loaded == classes)
@pytest.mark.parametrize("database_type", ["mock"])
def test_load_model_and_classes(model_loader, model_database_record_identifier, model, classes):
@ -17,3 +10,12 @@ def test_load_model_and_classes(model_loader, model_database_record_identifier,
assert model_loaded == model
assert classes_loaded == classes
@pytest.mark.parametrize("database_type", ["mlflow"])
def test_load_model_and_classes_from_mlflow_store(model_loader, mlflow_run_id):
model_loaded = model_loader.load_model(mlflow_run_id)
classes_loaded = model_loader.load_classes(mlflow_run_id)
assert type(model_loaded) == PredictionModelHandle
assert classes_loaded == ['formula', 'logo', 'other', 'signature']