diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..81a0e9a
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,54 @@
+# .coveragerc to control coverage.py
+[run]
+branch = True
+omit =
+    */site-packages/*
+    */distutils/*
+    */test/*
+    */__init__.py
+    */setup.py
+    */venv/*
+    */env/*
+    */build_venv/*
+    */build_env/*
+source =
+    image_prediction
+    src
+relative_files = True
+data_file = .coverage
+
+[report]
+# Regexes for lines to exclude from consideration
+exclude_lines =
+    # Have to re-enable the standard pragma
+    pragma: no cover
+
+    # Don't complain about missing debug-only code:
+    def __repr__
+    if self\.debug
+
+    # Don't complain if tests don't hit defensive assertion code:
+    raise AssertionError
+    raise NotImplementedError
+
+    # Don't complain if non-runnable code isn't run:
+    if 0:
+    if __name__ == .__main__.:
+omit =
+    */site-packages/*
+    */distutils/*
+    */test/*
+    */__init__.py
+    */setup.py
+    */venv/*
+    */env/*
+    */build_venv/*
+    */build_env/*
+
+ignore_errors = True
+
+[html]
+directory = reports
+
+[xml]
+output = reports/coverage.xml
diff --git a/image_prediction/flask.py b/image_prediction/flask.py
index 34f8a29..5cf40c2 100644
--- a/image_prediction/flask.py
+++ b/image_prediction/flask.py
@@ -1,3 +1,4 @@
+import multiprocessing
 from typing import Callable
 
 from flask import Flask, request, jsonify
@@ -25,11 +26,32 @@ def make_prediction_server(predict_fn: Callable):
 
     @app.route("/", methods=["POST"])
     def predict():
-        pdf = request.data
+        def predict_fn_wrapper(pdf, return_dict):
+            return_dict["result"] = predict_fn(pdf)
+
+        def process():
+            # TensorFlow does not free RAM; the workaround is to run the model in a separate process.
+            # https://stackoverflow.com/questions/39758094/clearing-tensorflow-gpu-memory-after-model-execution
+            pdf = request.data
+            manager = multiprocessing.Manager()
+            return_dict = manager.dict()
+            p = multiprocessing.Process(
+                target=predict_fn_wrapper,
+                args=(
+                    pdf,
+                    return_dict,
+                ),
+            )
+            p.start()
+            p.join()
+            # If the child died before writing a result, the KeyError propagates to the caller.
+            try:
+                return dict(return_dict)["result"]
+            except KeyError:
+                raise
 
         logger.debug("Running predictor on document...")
         try:
-            predictions = predict_fn(pdf)
+            predictions = process()
             response = jsonify(predictions)
             logger.info("Analysis completed.")
             return response
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..5922a79
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+norecursedirs = incl
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d20ead3..217a846 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,5 @@ PDFNetPython3~=9.1.0
 Pillow~=8.3.2
 PyYAML~=5.4.1
 scikit_learn~=0.24.2
+
+pytest~=7.1.0
\ No newline at end of file
diff --git a/scripts/keras_MnWE.py b/scripts/keras_MnWE.py
new file mode 100644
index 0000000..05a45dd
--- /dev/null
+++ b/scripts/keras_MnWE.py
@@ -0,0 +1,59 @@
+import multiprocessing
+
+import numpy as np
+from tensorflow import keras
+from tensorflow.keras import layers
+
+
+def process(predict_fn_wrapper):
+    # We observed that memory doesn't get deallocated properly unless we do this:
+    manager = multiprocessing.Manager()
+    return_dict = manager.dict()
+    p = multiprocessing.Process(
+        target=predict_fn_wrapper,
+        args=(return_dict,),
+    )
+    p.start()
+    p.join()
+    try:
+        return dict(return_dict)["result"]
+    except KeyError:
+        pass
+
+
+def make_model():
+    inputs = keras.Input(shape=(784,))
+    dense = layers.Dense(64, activation="relu")
+    x = dense(inputs)
+    outputs = layers.Dense(10)(x)
+    model = keras.Model(inputs=inputs, outputs=outputs, name="mnist_model")
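+    # Presumably any small compiled model would do here; the MNIST-sized layers just keep the example fast.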
name="mnist_model") + model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.RMSprop(), + metrics=["accuracy"], + ) + return model + + +def make_predict_fn(): + # Keras bug: doesn't work in outer scope + model = make_model() + + def predict(*args): + # model = make_model() + return model.predict(np.random.random(size=(1, 784))) + + return predict + + +def make_predict_fn_wrapper(predict_fn): + def predict_fn_wrapper(return_dict): + return_dict["result"] = predict_fn() + + return predict_fn_wrapper + + +if __name__ == "__main__": + predict_fn = make_predict_fn() + print(process(make_predict_fn_wrapper(predict_fn))) diff --git a/src/serve.py b/src/serve.py index f44b632..666ca80 100644 --- a/src/serve.py +++ b/src/serve.py @@ -12,13 +12,14 @@ logger = get_logger() def main(): - def predict(pdf): + # Keras model.predict stalls when model was loaded in different process + # https://stackoverflow.com/questions/42504669/keras-tensorflow-and-multiprocessing-in-python + predictor = Predictor() predictions, metadata = predictor.predict_pdf(pdf, verbose=CONFIG.service.progressbar) response = build_response(predictions, metadata) return response - predictor = Predictor() logger.info("Predictor ready.") prediction_server = make_prediction_server(predict) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..71b37d1 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,70 @@ +import os.path + +import pytest + +from image_prediction.predictor import Predictor + + +@pytest.fixture +def predictions(): + return [ + { + "class": "signature", + "probabilities": { + "signature": 1.0, + "logo": 9.150285377746546e-19, + "other": 4.374506412383356e-19, + "formula": 3.582569597002796e-24, + }, + } + ] + + +@pytest.fixture +def metadata(): + return [ + { + "page_height": 612.0, + "page_width": 792.0, + "height": 61.049999999999955, + "width": 139.35000000000002, + "page_idx": 8, + "x1": 63.5, + "x2": 202.85000000000002, + "y1": 472.0, + "y2": 533.05, + } + ] + + +@pytest.fixture +def response(): + return [ + { + "classification": { + "label": "signature", + "probabilities": {"formula": 0.0, "logo": 0.0, "other": 0.0, "signature": 1.0}, + }, + "filters": { + "allPassed": True, + "geometry": { + "imageFormat": {"quotient": 2.282555282555285, "tooTall": False, "tooWide": False}, + "imageSize": {"quotient": 0.13248234868245012, "tooLarge": False, "tooSmall": False}, + }, + "probability": {"unconfident": False}, + }, + "geometry": {"height": 61.049999999999955, "width": 139.35000000000002}, + "position": {"pageNumber": 9, "x1": 63.5, "x2": 202.85000000000002, "y1": 472.0, "y2": 533.05}, + } + ] + + +@pytest.fixture +def predictor(): + return Predictor() + + +@pytest.fixture +def test_pdf(): + with open("./test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf", "rb") as f: + return f.read() diff --git a/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf b/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf new file mode 100644 index 0000000..41f0d70 Binary files /dev/null and b/test/test_data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf differ diff --git a/test/unit_tests/__init__.py b/test/unit_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/unit_tests/test_predictor.py b/test/unit_tests/test_predictor.py new file mode 100644 index 0000000..0da6f91 --- /dev/null +++ b/test/unit_tests/test_predictor.py @@ -0,0 +1,26 @@ +def 
+    # FIXME: ugly test, since the dict contains '\n's of unknown origin
+    predictions, metadata = predictor.predict_pdf(test_pdf)
+    predictions = [p for p in predictions][0]
+    assert predictions["class"] == "formula"
+    probabilities = predictions["probabilities"]
+    # Floating-point precision makes the output values unstable, so only test that the keys exist, not the values
+    assert all(key in probabilities for key in ("formula", "other", "signature", "logo"))
+    metadata = list(metadata)
+    metadata = dict(**metadata[0])
+    metadata.pop("document_filename")  # temp filename cannot be tested
+    assert metadata == {
+        "px_width": 389.0,
+        "px_height": 389.0,
+        "width": 194.49999000000003,
+        "height": 194.49998999999997,
+        "x1": 320.861,
+        "x2": 515.36099,
+        "y1": 347.699,
+        "y2": 542.19899,
+        "page_width": 595.2800000000001,
+        "page_height": 841.89,
+        "page_rotation": 0,
+        "page_idx": 1,
+        "n_pages": 3,
+    }
diff --git a/test/unit_tests/test_response.py b/test/unit_tests/test_response.py
new file mode 100644
index 0000000..696c92b
--- /dev/null
+++ b/test/unit_tests/test_response.py
@@ -0,0 +1,6 @@
+from image_prediction.response import build_response
+
+
+def test_build_response_returns_valid_response(predictions, metadata, response):
+    # The predictions, metadata, and expected response fixtures come from test/conftest.py.
+    assert build_response(predictions, metadata) == response