From 684aca364ffc4f1d2ad5ec37b3f7622de955130f Mon Sep 17 00:00:00 2001 From: cdietrich Date: Tue, 1 Mar 2022 14:17:37 +0100 Subject: [PATCH 1/5] adapt service-container to image-service-v2 --- .dvc/.gitignore | 1 + .dvc/config | 3 +- .gitmodules | 6 +- Dockerfile | 8 +- README.md | 4 +- .../src/main/java/buildjob/PlanSpec.java | 14 +- data/.gitignore | 1 - data/checkpoint.pth.dvc | 4 - data/hub/checkpoints/.gitignore | 1 - .../hub/checkpoints/resnet50-0676ba61.pth.dvc | 4 - fb_detr/predictor.py | 162 ------------------ {fb_detr => image_prediction}/__init__.py | 0 {fb_detr => image_prediction}/config.py | 2 +- {fb_detr => image_prediction}/locations.py | 0 image_prediction/predictor.py | 89 ++++++++++ .../utils/__init__.py | 0 .../utils/estimator.py | 6 +- .../utils/non_max_supprs.py | 0 {fb_detr => image_prediction}/utils/stream.py | 0 incl/detr | 1 - requirements.txt | 14 +- scripts/flask_test.py | 2 +- setup.py | 4 +- setup/docker.sh | 10 +- sonar-project.properties | 2 +- src/serve.py | 4 +- 26 files changed, 128 insertions(+), 214 deletions(-) delete mode 100644 data/.gitignore delete mode 100644 data/checkpoint.pth.dvc delete mode 100644 data/hub/checkpoints/.gitignore delete mode 100644 data/hub/checkpoints/resnet50-0676ba61.pth.dvc delete mode 100644 fb_detr/predictor.py rename {fb_detr => image_prediction}/__init__.py (100%) rename {fb_detr => image_prediction}/config.py (94%) rename {fb_detr => image_prediction}/locations.py (100%) create mode 100644 image_prediction/predictor.py rename {fb_detr => image_prediction}/utils/__init__.py (100%) rename {fb_detr => image_prediction}/utils/estimator.py (80%) rename {fb_detr => image_prediction}/utils/non_max_supprs.py (100%) rename {fb_detr => image_prediction}/utils/stream.py (100%) delete mode 160000 incl/detr diff --git a/.dvc/.gitignore b/.dvc/.gitignore index 528f30c..0468ef1 100644 --- a/.dvc/.gitignore +++ b/.dvc/.gitignore @@ -1,3 +1,4 @@ /config.local /tmp /cache +/plots/ diff --git a/.dvc/config b/.dvc/config index bde583a..9277694 100644 --- a/.dvc/config +++ b/.dvc/config @@ -1,6 +1,5 @@ [core] remote = vector - autostage = true ['remote "vector"'] - url = ssh://vector.iqser.com/research/detr_server/ + url = ssh://vector.iqser.com/research/image_service/ port = 22 diff --git a/.gitmodules b/.gitmodules index 6ea2203..91435b0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "incl/detr"] - path = incl/detr - url = ssh://git@git.iqser.com:2222/rr/detr.git +[submodule "incl/redai_image"] + path = incl/redai_image + url = ssh://git@git.iqser.com:2222/rr/redai_image.git \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 50a495f..7e336d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,13 @@ ARG BASE_ROOT="nexus.iqser.com:5001/red/" ARG VERSION_TAG="latest" -FROM ${BASE_ROOT}fb-detr-base:${VERSION_TAG} +FROM ${BASE_ROOT}image-prediction-base:${VERSION_TAG} WORKDIR /app/service COPY ./src ./src -COPY ./incl/detr ./incl/detr -COPY ./fb_detr ./fb_detr +COPY ./incl/redai_image ./incl/redai_image +COPY image_prediction ./image_prediction COPY ./setup.py ./setup.py COPY ./requirements.txt ./requirements.txt COPY ./config.yaml ./config.yaml @@ -17,7 +17,7 @@ RUN python3 -m pip install -r requirements.txt RUN python3 -m pip install -e . -WORKDIR /app/service/incl/detr +WORKDIR /app/service/incl/redai_image RUN python3 -m pip install -e . WORKDIR /app/service diff --git a/README.md b/README.md index 17a0500..41e2f56 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ setup/docker.sh Build head image ```bash -docker build -f Dockerfile -t detr-server . --build-arg BASE_ROOT="" +docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT="" ``` ### Usage @@ -15,7 +15,7 @@ docker build -f Dockerfile -t detr-server . --build-arg BASE_ROOT="" Shell 1 ```bash -docker run --rm --net=host --rm detr-server +docker run --rm --net=host --rm image-prediction ``` Shell 2 diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index 090be4d..adba0f2 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -33,8 +33,8 @@ import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location; @BambooSpec public class PlanSpec { - private static final String SERVICE_NAME = "fb-detr"; - private static final String SERVICE_NAME_BASE = "fb-detr-base"; + private static final String SERVICE_NAME = "image-prediction"; + private static final String SERVICE_NAME_BASE = "image-prediction-base"; private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_",""); @@ -72,7 +72,7 @@ public class PlanSpec { return new Plan( project(), SERVICE_NAME, new BambooKey(SERVICE_KEY)) - .description("Docker build for fb-detr.") + .description("Docker build for image-prediction.") // .variables() .stages(new Stage("Build Stage") .jobs( @@ -86,7 +86,7 @@ public class PlanSpec { .checkoutItems(new CheckoutItem().defaultRepository()), new VcsCheckoutTask() .description("Checkout detr research repository.") - .checkoutItems(new CheckoutItem().repository("RR / DETR").path("DETR")), + .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")), new ScriptTask() .description("Set config and keys.") .inlineBody("mkdir -p ~/.ssh\n" + @@ -112,8 +112,8 @@ public class PlanSpec { .description("Checkout default repository.") .checkoutItems(new CheckoutItem().defaultRepository()), new VcsCheckoutTask() - .description("Checkout detr research repository.") - .checkoutItems(new CheckoutItem().repository("RR / DETR").path("DETR")), + .description("Checkout redai_image repository.") + .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")), new ScriptTask() .description("Set config and keys.") .inlineBody("mkdir -p ~/.ssh\n" + @@ -174,7 +174,7 @@ public class PlanSpec { .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml") .volume("/var/run/docker.sock", "/var/run/docker.sock")))) .linkedRepositories("RR / " + SERVICE_NAME) - .linkedRepositories("RR / DETR") + .linkedRepositories("RR / redai_image") .triggers(new BitbucketServerTrigger()) .planBranchManagement(new PlanBranchManagement() .createForVcsBranch() diff --git a/data/.gitignore b/data/.gitignore deleted file mode 100644 index 65ac288..0000000 --- a/data/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/checkpoint.pth diff --git a/data/checkpoint.pth.dvc b/data/checkpoint.pth.dvc deleted file mode 100644 index 7707825..0000000 --- a/data/checkpoint.pth.dvc +++ /dev/null @@ -1,4 +0,0 @@ -outs: -- md5: 9face65530febd41a0722e0513da2264 - size: 496696129 - path: checkpoint.pth diff --git a/data/hub/checkpoints/.gitignore b/data/hub/checkpoints/.gitignore deleted file mode 100644 index 17c6958..0000000 --- a/data/hub/checkpoints/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/resnet50-0676ba61.pth diff --git a/data/hub/checkpoints/resnet50-0676ba61.pth.dvc b/data/hub/checkpoints/resnet50-0676ba61.pth.dvc deleted file mode 100644 index 1110d26..0000000 --- a/data/hub/checkpoints/resnet50-0676ba61.pth.dvc +++ /dev/null @@ -1,4 +0,0 @@ -outs: -- md5: b94941323912291bb67db6fdb1d80c11 - size: 102530333 - path: resnet50-0676ba61.pth diff --git a/fb_detr/predictor.py b/fb_detr/predictor.py deleted file mode 100644 index e083dbf..0000000 --- a/fb_detr/predictor.py +++ /dev/null @@ -1,162 +0,0 @@ -import argparse -import logging -from itertools import compress, starmap, chain -from operator import itemgetter -from pathlib import Path -from typing import Iterable - -import torch -from iteration_utilities import starfilter -from tqdm import tqdm - -from detr.models import build_model -from detr.prediction import get_args_parser, infer -from fb_detr.config import CONFIG -from fb_detr.utils.non_max_supprs import greedy_non_max_supprs -from fb_detr.utils.stream import stream_pages, chunk_iterable, get_page_count - - -def load_model(checkpoint_path): - - parser = argparse.ArgumentParser(parents=[get_args_parser()]) - args = parser.parse_args() - - if args.output_dir: - Path(args.output_dir).mkdir(parents=True, exist_ok=True) - - device = torch.device(CONFIG.estimator.device) - - model, _, _ = build_model(args) - - checkpoint = torch.load(checkpoint_path, map_location="cpu") - model.load_state_dict(checkpoint["model"]) - - model.to(device) - - return model - - -class Predictor: - def __init__(self, checkpoint_path, classes=None, rejection_class=None): - self.model = load_model(checkpoint_path) - self.classes = classes - self.rejection_class = rejection_class - - @staticmethod - def __format_boxes(boxes): - - keys = "x1", "y1", "x2", "y2" - - x1s = boxes[:, 0].tolist() - y1s = boxes[:, 1].tolist() - x2s = boxes[:, 2].tolist() - y2s = boxes[:, 3].tolist() - - boxes = [dict(zip(keys, vs)) for vs in zip(x1s, y1s, x2s, y2s)] - - return boxes - - @staticmethod - def __normalize_to_list(maybe_multiple): - return maybe_multiple if isinstance(maybe_multiple, tuple) else tuple([maybe_multiple]) - - def __format_classes(self, classes): - if self.classes: - return self.__normalize_to_list(itemgetter(*classes.tolist())(self.classes)) - else: - return classes.tolist() - - @staticmethod - def __format_probas(probas): - return probas.max(axis=1).tolist() - - def __format_prediction(self, predictions: dict): - - boxes, classes, probas = itemgetter("bboxes", "classes", "probas")(predictions) - - if len(boxes): - boxes = self.__format_boxes(boxes) - classes = self.__format_classes(classes) - probas = self.__format_probas(probas) - else: - boxes, classes, probas = [], [], [] - - predictions["bboxes"] = boxes - predictions["classes"] = classes - predictions["probas"] = probas - - return predictions - - def __filter_predictions_for_image(self, predictions): - - boxes, classes, probas = itemgetter("bboxes", "classes", "probas")(predictions) - - if boxes: - keep = map(lambda c: c != self.rejection_class, classes) - compressed = list(compress(zip(boxes, classes, probas), keep)) - boxes, classes, probas = map(list, zip(*compressed)) if compressed else ([], [], []) - predictions["bboxes"] = boxes - predictions["classes"] = classes - predictions["probas"] = probas - - return predictions - - def filter_predictions(self, predictions): - def detections_present(_, prediction): - return bool(prediction["classes"]) - - # TODO: set page_idx even when not filtering - def build_return_dict(page_idx, predictions): - return {"page_idx": page_idx, **predictions} - - filtered_rejections = map(self.__filter_predictions_for_image, predictions) - filtered_no_detections = starfilter(detections_present, enumerate(filtered_rejections)) - filtered_no_detections = starmap(build_return_dict, filtered_no_detections) - - return filtered_no_detections - - def format_predictions(self, outputs: Iterable): - return map(self.__format_prediction, outputs) - - def __non_max_supprs(self, predictions): - predictions = map(greedy_non_max_supprs, predictions) - return predictions - - def predict(self, images, threshold=None): - - if not threshold: - threshold = CONFIG.estimator.threshold - - predictions = infer(images, self.model, CONFIG.estimator.device, threshold) - predictions = self.format_predictions(predictions) - if self.rejection_class: - predictions = self.filter_predictions(predictions) - - predictions = self.__non_max_supprs(predictions) - - predictions = list(predictions) - - return predictions - - def predict_pdf(self, pdf: bytes): - def progress(generator): - - page_count = get_page_count(pdf) - batch_count = int(page_count / CONFIG.service.batch_size) - - yield from tqdm( - generator, total=batch_count, position=1, leave=True - ) if CONFIG.service.verbose else generator - - def predict_batch(batch_idx, batch): - predictions = self.predict(batch) - for p in predictions: - p["page_idx"] += batch_idx - - return predictions - - page_stream = stream_pages(pdf) - page_batches = chunk_iterable(page_stream, CONFIG.service.batch_size) - predictions = list(chain(*starmap(predict_batch, progress(enumerate(page_batches))))) - - return predictions diff --git a/fb_detr/__init__.py b/image_prediction/__init__.py similarity index 100% rename from fb_detr/__init__.py rename to image_prediction/__init__.py diff --git a/fb_detr/config.py b/image_prediction/config.py similarity index 94% rename from fb_detr/config.py rename to image_prediction/config.py index 49dc564..f37658f 100644 --- a/fb_detr/config.py +++ b/image_prediction/config.py @@ -3,7 +3,7 @@ from envyaml import EnvYAML -from fb_detr.locations import CONFIG_FILE +from image_prediction.locations import CONFIG_FILE def _get_item_and_maybe_make_dotindexable(container, item): diff --git a/fb_detr/locations.py b/image_prediction/locations.py similarity index 100% rename from fb_detr/locations.py rename to image_prediction/locations.py diff --git a/image_prediction/predictor.py b/image_prediction/predictor.py new file mode 100644 index 0000000..8320583 --- /dev/null +++ b/image_prediction/predictor.py @@ -0,0 +1,89 @@ +import logging +from operator import itemgetter + +from image_prediction.config import CONFIG + + +class Predictor: + """`ModelHandle` wrapper. Forwards to wrapped model handle for prediction and produces structured output that is + interpretable independently of the wrapped model (e.g. with regard to a .classes_ attribute). + """ + + def __init__(self, model_handle: ModelHandle = None): + """Initializes a ServiceEstimator. + + Args: + model_handle: ModelHandle object to forward to for prediction. By default, a model handle is loaded from the + mlflow database via CONFIG.service.run_id. + """ + try: + if model_handle is None: + reader = MlflowModelReader( + run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR + ) + # message_queue.put(text="Loading model...", level=logging.DEBUG) + self.model_handle = reader.get_model_handle(BASE_WEIGHTS) + # message_queue.put(text="Model loaded.", level=logging.DEBUG) + else: + self.model_handle = model_handle + + self.classes = self.model_handle.model.classes_ + self.classes_readable = np.array(self.model_handle.classes) + self.classes_readable_aligned = self.classes_readable[self.classes[list(range(len(self.classes)))]] + except Exception as e: + message_queue.put( + text="Service estimator initialization failed.", + exception=e, + level=logging.CRITICAL, + trace=traceback.format_exc(), + ) + + def __make_predictions_human_readable(self, probs: np.ndarray) -> List[Dict[str, float]]: + """Translates an n x m matrix of probabilities over classes into an n-element list of mappings from classes to + probabilities. + + Args: + probs: probability matrix (items x classes) + + Returns: + list of mappings from classes to probabilities. + """ + classes = np.argmax(probs, axis=1) + classes = self.classes[classes] + classes_readable = [self.model_handle.classes[c] for c in classes] + return classes_readable + + def predict(self, images: List, probabilities: bool = False, **kwargs): + """Gathers predictions for list of images. Assigns each image a class and optionally a probability distribution + over all classes. + + Args: + images (List[PIL.Image]) : Images to gather predictions for. + probabilities: Whether to return dictionaries of the following form instead of strings: + { + "class": predicted class, + "probabilities": { + "class 1" : class 1 probability, + "class 2" : class 2 probability, + ... + } + } + + Returns: + By default the return value is a list of classes (meaningful class name strings). Alternatively a list of + dictionaries with an additional probability field for estimated class probabilities per image can be + returned. + """ + X = self.model_handle.prep_images(list(images)) + + probs_per_item = self.model_handle.model.predict_proba(X, **kwargs).astype(float) + classes = self.__make_predictions_human_readable(probs_per_item) + + class2prob_per_item = [dict(zip(self.classes_readable_aligned, probs)) for probs in probs_per_item] + class2prob_per_item = [ + dict(sorted(c2p.items(), key=itemgetter(1), reverse=True)) for c2p in class2prob_per_item + ] + + predictions = [{"class": c, "probabilities": c2p} for c, c2p in zip(classes, class2prob_per_item)] + + return predictions if probabilities else classes diff --git a/fb_detr/utils/__init__.py b/image_prediction/utils/__init__.py similarity index 100% rename from fb_detr/utils/__init__.py rename to image_prediction/utils/__init__.py diff --git a/fb_detr/utils/estimator.py b/image_prediction/utils/estimator.py similarity index 80% rename from fb_detr/utils/estimator.py rename to image_prediction/utils/estimator.py index 04c6922..113f02a 100644 --- a/fb_detr/utils/estimator.py +++ b/image_prediction/utils/estimator.py @@ -1,8 +1,8 @@ import os -from fb_detr.config import CONFIG -from fb_detr.locations import DATA_DIR, TORCH_HOME -from fb_detr.predictor import Predictor +from image_prediction.config import CONFIG +from image_prediction.locations import DATA_DIR, TORCH_HOME +from image_prediction.predictor import Predictor def suppress_userwarnings(): diff --git a/fb_detr/utils/non_max_supprs.py b/image_prediction/utils/non_max_supprs.py similarity index 100% rename from fb_detr/utils/non_max_supprs.py rename to image_prediction/utils/non_max_supprs.py diff --git a/fb_detr/utils/stream.py b/image_prediction/utils/stream.py similarity index 100% rename from fb_detr/utils/stream.py rename to image_prediction/utils/stream.py diff --git a/incl/detr b/incl/detr deleted file mode 160000 index 7720238..0000000 --- a/incl/detr +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 772023801e4fd3deef7953f7f49fd6fb2bf60236 diff --git a/requirements.txt b/requirements.txt index d327140..2d19464 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,3 @@ -torch==1.10.2 -numpy==1.22.1 -opencv-python-headless==4.5.5.62 -torchvision==0.11.3 -pycocotools==2.0.4 -scipy==1.7.3 -pdf2image==1.16.0 Flask==2.0.2 requests==2.27.1 iteration-utilities==0.11.0 @@ -12,5 +5,10 @@ dvc==2.9.3 dvc[ssh] frozendict==2.3.0 waitress==2.0.0 -envyaml==1.10.211231 +envyaml~=1.8.210417 dependency-check==0.6.* +envyaml~=1.8.210417 +mlflow~=1.20.2 +numpy~=1.19.3 +PDFNetPython3~=9.1.0 +tqdm~=4.62.2 diff --git a/scripts/flask_test.py b/scripts/flask_test.py index 2934f11..ba95c12 100644 --- a/scripts/flask_test.py +++ b/scripts/flask_test.py @@ -9,7 +9,7 @@ app = Flask(__name__) @app.before_first_request def init(): - from fb_detr.predictor import Predictor + from image_prediction.predictor import Predictor global PRED diff --git a/setup.py b/setup.py index 21d77de..53742f4 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,11 @@ from distutils.core import setup setup( - name="fb_detr", + name="image_prediction", version="0.1.0", description="", author="", author_email="", url="", - packages=["fb_detr"], + packages=["image_prediction"], ) diff --git a/setup/docker.sh b/setup/docker.sh index ae86dfd..b8cf880 100755 --- a/setup/docker.sh +++ b/setup/docker.sh @@ -5,11 +5,11 @@ python3 -m venv build_venv source build_venv/bin/activate python3 -m pip install --upgrade pip -pip install dvc -pip install 'dvc[ssh]' -dvc pull +#pip install dvc +#pip install 'dvc[ssh]' +#dvc pull git submodule update --init --recursive -docker build -f Dockerfile_base -t fb-detr-base . -docker build -f Dockerfile -t fb-detr . +docker build -f Dockerfile_base -t image-prediction-base . +docker build -f Dockerfile -t image-prediction . diff --git a/sonar-project.properties b/sonar-project.properties index deefee9..4eb136e 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -1,4 +1,4 @@ -sonar.exclusions=bamboo-specs/**, **/test_data/**, **/detr/** +sonar.exclusions=bamboo-specs/**, **/test_data/** sonar.c.file.suffixes=- sonar.cpp.file.suffixes=- sonar.objc.file.suffixes=- diff --git a/src/serve.py b/src/serve.py index fa44ef9..65f6d04 100644 --- a/src/serve.py +++ b/src/serve.py @@ -5,8 +5,8 @@ from typing import Callable from flask import Flask, request, jsonify from waitress import serve -from fb_detr.config import CONFIG -from fb_detr.utils.estimator import suppress_userwarnings, initialize_predictor +from image_prediction.config import CONFIG +from image_prediction.utils.estimator import suppress_userwarnings, initialize_predictor def parse_args(): From 42ae5793e04fbc546b29a122e9718041380faa50 Mon Sep 17 00:00:00 2001 From: cdietrich Date: Tue, 1 Mar 2022 16:48:03 +0100 Subject: [PATCH 2/5] RED-3501: adapt service-container to image-service-v2 --- .gitignore | 2 + .gitmodules | 2 +- Dockerfile | 3 +- Dockerfile_base | 4 +- config.yaml | 26 +++++--- data/base_weights.h5.dvc | 4 ++ data/mlruns.dvc | 5 ++ image_prediction/locations.py | 17 +++-- image_prediction/predictor.py | 15 +++-- image_prediction/response.py | 71 +++++++++++++++++++++ image_prediction/utils/estimator.py | 32 ---------- incl/redai_image | 1 + setup/docker.sh | 6 +- src/serve.py | 97 ++++++++++++++++++----------- 14 files changed, 191 insertions(+), 94 deletions(-) create mode 100644 data/base_weights.h5.dvc create mode 100644 data/mlruns.dvc create mode 100644 image_prediction/response.py delete mode 100644 image_prediction/utils/estimator.py create mode 160000 incl/redai_image diff --git a/.gitignore b/.gitignore index 56faae6..a14b81f 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,5 @@ fabric.properties .idea/codestream.xml # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm +/image_prediction/data/mlruns/ +/data/mlruns/ diff --git a/.gitmodules b/.gitmodules index 91435b0..1ee8d73 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "incl/redai_image"] path = incl/redai_image - url = ssh://git@git.iqser.com:2222/rr/redai_image.git \ No newline at end of file + url = ssh://git@git.iqser.com:2222/rr/redai_image.git diff --git a/Dockerfile b/Dockerfile index 7e336d2..fb00b3e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,14 @@ COPY image_prediction ./image_prediction COPY ./setup.py ./setup.py COPY ./requirements.txt ./requirements.txt COPY ./config.yaml ./config.yaml +COPY data data # Install dependencies differing from base image. RUN python3 -m pip install -r requirements.txt RUN python3 -m pip install -e . -WORKDIR /app/service/incl/redai_image +WORKDIR /app/service/incl/redai_image/redai RUN python3 -m pip install -e . WORKDIR /app/service diff --git a/Dockerfile_base b/Dockerfile_base index 4085902..c8decb7 100644 --- a/Dockerfile_base +++ b/Dockerfile_base @@ -10,11 +10,13 @@ RUN python -m pip install --upgrade pip # Make a directory for the service files and copy the service repo into the container. WORKDIR /app/service COPY ./requirements.txt ./requirements.txt -COPY ./data ./data +COPY ./incl/redai_image/redai/requirements_user.txt ./requirements_redai.txt # Install dependencies. RUN python3 -m pip install -r requirements.txt +RUN python3 -m pip install -r requirements_redai.txt + # Make a new container and copy all relevant files over to filter out temporary files # produced during setup to reduce the final container's size. FROM python:3.8 diff --git a/config.yaml b/config.yaml index 2a202b4..c48b4d6 100644 --- a/config.yaml +++ b/config.yaml @@ -1,10 +1,3 @@ -estimator: - checkpoint: checkpoint.pth - classes: ["logo", "other", "formula", "signature", "handwriting_other"] - rejection_class: "other" - threshold: .5 - device: cpu - webserver: host: $SERVER_HOST|"127.0.0.1" # webserver address port: $SERVER_PORT|5000 # webserver port @@ -14,3 +7,22 @@ service: logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger batch_size: $BATCH_SIZE|2 # Number of images in memory simultaneously verbose: $VERBOSE|True # Service prints document processing progress to stdout + run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the model from + + +# These variables control filters that are applied to either images, image metadata or model predictions. The filter +# result values are reported in the service responses. For convenience the response to a request contains a +# "filters.allPassed" field, which is set to false if any of the filters returned values did not meet its specified +# required value. +filters: + + image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas) + min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible + max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible + + image_width_to_height_quotient: # Image width to height ratio + min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible + max: $MAX_IMAGE_FORMAT|10 # Maximum permissible + + min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence + diff --git a/data/base_weights.h5.dvc b/data/base_weights.h5.dvc new file mode 100644 index 0000000..9f07d13 --- /dev/null +++ b/data/base_weights.h5.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 6d0186c1f25e889d531788f168fa6cf0 + size: 16727296 + path: base_weights.h5 diff --git a/data/mlruns.dvc b/data/mlruns.dvc new file mode 100644 index 0000000..d390fed --- /dev/null +++ b/data/mlruns.dvc @@ -0,0 +1,5 @@ +outs: +- md5: d1c708270bab6fcd344d4a8b05d1103d.dir + size: 150225383 + nfiles: 178 + path: mlruns diff --git a/image_prediction/locations.py b/image_prediction/locations.py index 264cdda..9c67dc0 100644 --- a/image_prediction/locations.py +++ b/image_prediction/locations.py @@ -1,7 +1,14 @@ -from pathlib import Path +from os import path +MODULE_DIR = path.dirname(path.abspath(__file__)) +PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR) +REPO_ROOT_DIR = path.dirname(path.dirname(PACKAGE_ROOT_DIR)) -MODULE_ROOT = Path(__file__).resolve().parents[1] -CONFIG_FILE = MODULE_ROOT / "config.yaml" -DATA_DIR = MODULE_ROOT / "data" -TORCH_HOME = DATA_DIR +DOCKER_COMPOSE_FILE = path.join(REPO_ROOT_DIR, "docker-compose.yaml") + +CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml") +LOG_FILE = "/tmp/log.log" + +DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data") +MLRUNS_DIR = path.join(DATA_DIR, "mlruns") +BASE_WEIGHTS = path.join(DATA_DIR, "base_weights.h5") diff --git a/image_prediction/predictor.py b/image_prediction/predictor.py index 8320583..3d89a69 100644 --- a/image_prediction/predictor.py +++ b/image_prediction/predictor.py @@ -1,7 +1,13 @@ import logging from operator import itemgetter +from typing import List, Dict + +import numpy as np from image_prediction.config import CONFIG +from image_prediction.locations import MLRUNS_DIR, BASE_WEIGHTS +from incl.redai_image.redai.redai.backend.model.model_handle import ModelHandle +from incl.redai_image.redai.redai.utils.mlflow_reader import MlflowModelReader class Predictor: @@ -21,9 +27,7 @@ class Predictor: reader = MlflowModelReader( run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR ) - # message_queue.put(text="Loading model...", level=logging.DEBUG) self.model_handle = reader.get_model_handle(BASE_WEIGHTS) - # message_queue.put(text="Model loaded.", level=logging.DEBUG) else: self.model_handle = model_handle @@ -31,12 +35,7 @@ class Predictor: self.classes_readable = np.array(self.model_handle.classes) self.classes_readable_aligned = self.classes_readable[self.classes[list(range(len(self.classes)))]] except Exception as e: - message_queue.put( - text="Service estimator initialization failed.", - exception=e, - level=logging.CRITICAL, - trace=traceback.format_exc(), - ) + logging.info(f"Service estimator initialization failed: {e}") def __make_predictions_human_readable(self, probs: np.ndarray) -> List[Dict[str, float]]: """Translates an n x m matrix of probabilities over classes into an n-element list of mappings from classes to diff --git a/image_prediction/response.py b/image_prediction/response.py new file mode 100644 index 0000000..2fc3225 --- /dev/null +++ b/image_prediction/response.py @@ -0,0 +1,71 @@ +"""Defines functions for constructing service responses.""" + + +from itertools import starmap +from operator import itemgetter + +import numpy as np + +from image_prediction.config import CONFIG + + +def build_response(predictions: list, metadata: list) -> list: + return list(starmap(build_image_info, zip(predictions, metadata))) + + +def build_image_info(prediction: dict, metadata: dict) -> dict: + def compute_geometric_quotient(): + page_area_sqrt = np.sqrt(abs(page_width * page_height)) + image_area_sqrt = np.sqrt(abs(x2 - x1) * abs(y2 - y1)) + return image_area_sqrt / page_area_sqrt + + page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( + "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height" + )(metadata) + + quotient = compute_geometric_quotient() + + min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min) + max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max) + min_image_width_to_height_quotient_breached = bool( + width / height < CONFIG.filters.image_width_to_height_quotient.min + ) + max_image_width_to_height_quotient_breached = bool( + width / height > CONFIG.filters.image_width_to_height_quotient.max + ) + + min_confidence_breached = bool(max(prediction["probabilities"].values()) < CONFIG.filters.min_confidence) + prediction["label"] = prediction.pop("class") # "class" as field name causes problem for Java objectmapper + prediction["probabilities"] = {klass: np.round(prob, 6) for klass, prob in prediction["probabilities"].items()} + + image_info = { + "classification": prediction, + "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": metadata["page_idx"] + 1}, + "geometry": {"width": width, "height": height}, + "filters": { + "geometry": { + "imageSize": { + "quotient": quotient, + "tooLarge": max_image_to_page_quotient_breached, + "tooSmall": min_image_to_page_quotient_breached, + }, + "imageFormat": { + "quotient": width / height, + "tooTall": min_image_width_to_height_quotient_breached, + "tooWide": max_image_width_to_height_quotient_breached, + }, + }, + "probability": {"unconfident": min_confidence_breached}, + "allPassed": not any( + [ + max_image_to_page_quotient_breached, + min_image_to_page_quotient_breached, + min_image_width_to_height_quotient_breached, + max_image_width_to_height_quotient_breached, + min_confidence_breached, + ] + ), + }, + } + + return image_info diff --git a/image_prediction/utils/estimator.py b/image_prediction/utils/estimator.py deleted file mode 100644 index 113f02a..0000000 --- a/image_prediction/utils/estimator.py +++ /dev/null @@ -1,32 +0,0 @@ -import os - -from image_prediction.config import CONFIG -from image_prediction.locations import DATA_DIR, TORCH_HOME -from image_prediction.predictor import Predictor - - -def suppress_userwarnings(): - import warnings - - warnings.filterwarnings("ignore") - - -def load_classes(): - classes = CONFIG.estimator.classes - id2class = dict(zip(range(1, len(classes) + 1), classes)) - return id2class - - -def get_checkpoint(): - return DATA_DIR / CONFIG.estimator.checkpoint - - -def set_torch_env(): - os.environ["TORCH_HOME"] = str(TORCH_HOME) - - -def initialize_predictor(resume): - set_torch_env() - checkpoint = get_checkpoint() if not resume else resume - predictor = Predictor(checkpoint, classes=load_classes(), rejection_class=CONFIG.estimator.rejection_class) - return predictor diff --git a/incl/redai_image b/incl/redai_image new file mode 160000 index 0000000..4c3b26d --- /dev/null +++ b/incl/redai_image @@ -0,0 +1 @@ +Subproject commit 4c3b26d7673457aaa99e0663dad6950cd36da967 diff --git a/setup/docker.sh b/setup/docker.sh index b8cf880..7b4a837 100755 --- a/setup/docker.sh +++ b/setup/docker.sh @@ -5,9 +5,9 @@ python3 -m venv build_venv source build_venv/bin/activate python3 -m pip install --upgrade pip -#pip install dvc -#pip install 'dvc[ssh]' -#dvc pull +pip install dvc +pip install 'dvc[ssh]' +dvc pull git submodule update --init --recursive diff --git a/src/serve.py b/src/serve.py index 65f6d04..4c292d3 100644 --- a/src/serve.py +++ b/src/serve.py @@ -1,17 +1,29 @@ import argparse +import json import logging -from typing import Callable +import tempfile +from itertools import chain +from operator import itemgetter +from typing import Iterable from flask import Flask, request, jsonify from waitress import serve from image_prediction.config import CONFIG -from image_prediction.utils.estimator import suppress_userwarnings, initialize_predictor +from image_prediction.predictor import Predictor +from image_prediction.response import build_response +from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch +from incl.redai_image.redai.redai.utils.shared import chunk_iterable + + +def suppress_userwarnings(): + import warnings + + warnings.filterwarnings("ignore") def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--resume") parser.add_argument("--warnings", action="store_true", default=False) args = parser.parse_args() @@ -22,16 +34,9 @@ def main(args): if not args.warnings: suppress_userwarnings() - predictor = initialize_predictor(args.resume) + predictor = Predictor() logging.info("Predictor ready.") - prediction_server = make_prediction_server(predictor.predict_pdf) - - run_prediction_server(prediction_server, mode=CONFIG.webserver.mode) - - -def make_prediction_server(predict_fn: Callable): - app = Flask(__name__) @app.route("/ready", methods=["GET"]) @@ -48,46 +53,66 @@ def make_prediction_server(predict_fn: Callable): @app.route("/", methods=["POST"]) def predict(): - def __predict(): - - def inner(): - - pdf = request.data - - logging.debug("Running predictor on document...") - predictions = predict_fn(pdf) - logging.debug(f"Found {len(predictions)} images in document.") - response = jsonify(list(predictions)) + pdf = request.data + logging.debug("Running predictor on document...") + # extract images from pdfs + with tempfile.NamedTemporaryFile() as tmp_file: + tmp_file.write(pdf) + image_metadata_pairs = extract_image_metadata_pairs(tmp_file.name) + try: + predictions, metadata = classify_images(predictor, image_metadata_pairs) + except Exception as err: + logging.warning("Analysis failed.") + logging.exception(err) + response = jsonify("Analysis failed.") + response.status_code = 500 return response + logging.debug(f"Found images in document.") - logging.info(f"Analyzing...") - result = inner() - logging.info("Analysis completed.") - return result + response = jsonify(build_response(list(predictions), list(metadata))) - try: - return __predict() - except Exception as err: - logging.warning("Analysis failed.") - logging.exception(err) - response = jsonify("Analysis failed.") - response.status_code = 500 - return response + logging.info("Analysis completed.") + return response - return app + run_prediction_server(app, mode=CONFIG.webserver.mode) def run_prediction_server(app, mode="development"): - if mode == "development": app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True) elif mode == "production": serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port) -if __name__ == "__main__": +def extract_image_metadata_pairs(pdf_path: str, **kwargs): + def image_is_large_enough(metadata: dict): + x1, x2, y1, y2 = itemgetter("x1", "x2", "y1", "y2")(metadata) + return abs(x1 - x2) > 2 and abs(y1 - y2) > 2 + + yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs) + + +def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size): + def process_chunk(chunk): + images, metadata = zip(*chunk) + predictions = predictor.predict(images, probabilities=True) + return predictions, metadata + + def predict(image_metadata_pair_generator): + chunks = chunk_iterable(image_metadata_pair_generator, n=batch_size) + return map(chain.from_iterable, zip(*map(process_chunk, chunks))) + + try: + predictions, metadata = predict(image_metadata_pairs) + return predictions, metadata + + except ValueError: + return [], [] + + +if __name__ == "__main__": logging_level = CONFIG.service.logging_level logging.basicConfig(level=logging_level) logging.getLogger("flask").setLevel(logging.ERROR) From 372d6645d75d8e6c6fc00aa150a5bb8a75650146 Mon Sep 17 00:00:00 2001 From: cdietrich Date: Wed, 2 Mar 2022 10:15:08 +0100 Subject: [PATCH 3/5] tidy up repo --- Dockerfile_base | 4 - README.md | 2 +- .../src/main/java/buildjob/PlanSpec.java | 2 +- .../src/main/resources/scripts/sonar-scan.sh | 8 +- docker-compose.yaml | 10 -- image_prediction/utils/__init__.py | 0 image_prediction/utils/non_max_supprs.py | 96 ------------------- image_prediction/utils/stream.py | 20 ---- scripts/client_mock.py | 58 ----------- scripts/flask_test.py | 35 ------- scripts/pyinfra_mock.py | 26 +++++ 11 files changed, 32 insertions(+), 229 deletions(-) delete mode 100644 docker-compose.yaml delete mode 100644 image_prediction/utils/__init__.py delete mode 100644 image_prediction/utils/non_max_supprs.py delete mode 100644 image_prediction/utils/stream.py delete mode 100644 scripts/client_mock.py delete mode 100644 scripts/flask_test.py create mode 100644 scripts/pyinfra_mock.py diff --git a/Dockerfile_base b/Dockerfile_base index c8decb7..81639d5 100644 --- a/Dockerfile_base +++ b/Dockerfile_base @@ -26,7 +26,3 @@ COPY --from=builder1 /app . ENV PATH="/app/venv/bin:$PATH" WORKDIR /app/service - -RUN apt update --yes -RUN apt install vim --yes -RUN apt install poppler-utils --yes diff --git a/README.md b/README.md index 41e2f56..f913627 100644 --- a/README.md +++ b/README.md @@ -21,5 +21,5 @@ docker run --rm --net=host --rm image-prediction Shell 2 ```bash -python scripts/client_mock.py --pdf_path /path/to/a/pdf +python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf ``` diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index adba0f2..4a4d308 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -85,7 +85,7 @@ public class PlanSpec { .description("Checkout default repository.") .checkoutItems(new CheckoutItem().defaultRepository()), new VcsCheckoutTask() - .description("Checkout detr research repository.") + .description("Checkout redai_image research repository.") .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")), new ScriptTask() .description("Set config and keys.") diff --git a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh index 693f216..febc142 100755 --- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh +++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh @@ -10,7 +10,7 @@ python3 -m pip install --upgrade pip echo "dev setup for unit test and coverage 💖" pip install -e . -pip install -e incl/detr +pip install -e incl/redai_image pip install -r requirements.txt SERVICE_NAME=$1 @@ -19,14 +19,14 @@ echo "dependency-check:aggregate" mkdir -p reports dependency-check --enableExperimental -f JSON -f HTML -f XML \ --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \ - --exclude "build_venv/**" --exclude "**/__pycache__/**" + --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**" if [[ -z "${bamboo_repository_pr_key}" ]] then echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}" /usr/bin/sonar-scanner/bin/sonar-scanner \ -Dsonar.projectKey=RED_$SERVICE_NAME \ - -Dsonar.sources=incl/image_service/image_service,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \ + -Dsonar.sources=image_prediction,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \ -Dsonar.host.url=https://sonarqube.iqser.com \ -Dsonar.login=${bamboo_sonarqube_api_token_secret} \ -Dsonar.branch.name=${bamboo_planRepository_1_branch} \ @@ -39,7 +39,7 @@ else echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}" /usr/bin/sonar-scanner/bin/sonar-scanner \ -Dsonar.projectKey=RED_$SERVICE_NAME \ - -Dsonar.sources=incl/image_service/image_service,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \ + -Dsonar.sources=image_prediction,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \ -Dsonar.host.url=https://sonarqube.iqser.com \ -Dsonar.login=${bamboo_sonarqube_api_token_secret} \ -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \ diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index 0afb54b..0000000 --- a/docker-compose.yaml +++ /dev/null @@ -1,10 +0,0 @@ -version: "3.3" -services: - detr-server: - image: detr-server - network_mode: "host" - read_only: true - volumes: - - tmp:/tmp:rw -volumes: - tmp: diff --git a/image_prediction/utils/__init__.py b/image_prediction/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/image_prediction/utils/non_max_supprs.py b/image_prediction/utils/non_max_supprs.py deleted file mode 100644 index f38a63e..0000000 --- a/image_prediction/utils/non_max_supprs.py +++ /dev/null @@ -1,96 +0,0 @@ -from collections import namedtuple -from itertools import starmap, combinations -from operator import attrgetter, itemgetter - -from frozendict import frozendict - -Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") - - -def make_box(x1, y1, x2, y2): - keys = "x1", "y1", "x2", "y2" - return dict(zip(keys, [x1, y1, x2, y2])) - - -def compute_intersection(a, b): - - a = Rectangle(*a.values()) - b = Rectangle(*b.values()) - - dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin) - dy = min(a.ymax, b.ymax) - max(a.ymin, b.ymin) - - return dx * dy if (dx >= 0) and (dy >= 0) else 0 - - -def compute_union(a, b): - def area(box): - r = Rectangle(*box.values()) - return (r.xmax - r.xmin) * (r.ymax - r.ymin) - - return (area(a) + area(b)) - compute_intersection(a, b) - - -def compute_iou(a, b): - return compute_intersection(a, b) / compute_union(a, b) - - -LPBox = namedtuple("LPBox", "label proba box") - - -def less_likely(a, b): - return min([a, b], key=attrgetter("proba")) - - -def overlap_too_much(a, b, iou_thresh): - iou = compute_iou(a.box, b.box) - return iou > iou_thresh - - -def __greedy_non_max_supprs(lpboxes, iou_thresh=0.1): - def remove_less_likely(a, b): - try: - ll = less_likely(a, b) - current_boxes.remove(ll) - except KeyError: - pass - - current_boxes = {*lpboxes} - - while True: - n = len(current_boxes) - for a, b in combinations(current_boxes, r=2): - if len({a, b} & current_boxes) != 2: - continue - if overlap_too_much(a, b, iou_thresh): - remove_less_likely(a, b) - - if n == len(current_boxes): - break - - return current_boxes - - -def lpboxes_to_dict(lpboxes): - - boxes = map(dict, map(attrgetter("box"), lpboxes)) - classes = map(attrgetter("label"), lpboxes) - probas = map(attrgetter("proba"), lpboxes) - - boxes, classes, probas = map(list, [boxes, classes, probas]) - - return {"bboxes": boxes, "classes": classes, "probas": probas} - - -def greedy_non_max_supprs(predictions): - - boxes, classes, probas = itemgetter("bboxes", "classes", "probas")(predictions) - boxes = map(frozendict, boxes) - lpboxes = list(starmap(LPBox, zip(classes, probas, boxes))) - - lpboxes = __greedy_non_max_supprs(lpboxes) - - merged_predictions = lpboxes_to_dict(lpboxes) - predictions.update(merged_predictions) - - return predictions diff --git a/image_prediction/utils/stream.py b/image_prediction/utils/stream.py deleted file mode 100644 index d9948a3..0000000 --- a/image_prediction/utils/stream.py +++ /dev/null @@ -1,20 +0,0 @@ -from itertools import takewhile, starmap, islice, repeat -from operator import truth - -from pdf2image import pdf2image - - -def chunk_iterable(iterable, n): - return takewhile(truth, map(tuple, starmap(islice, repeat((iter(iterable), n))))) - - -def get_page_count(pdf): - return pdf2image.pdfinfo_from_bytes(pdf)["Pages"] - - -def stream_pages(pdf): - def page_to_image(idx): - return pdf2image.convert_from_bytes(pdf, first_page=idx, last_page=idx + 1)[0] - - page_count = get_page_count(pdf) - return map(page_to_image, range(page_count)) diff --git a/scripts/client_mock.py b/scripts/client_mock.py deleted file mode 100644 index 7d26000..0000000 --- a/scripts/client_mock.py +++ /dev/null @@ -1,58 +0,0 @@ -import argparse -import json -from operator import itemgetter - -import pdf2image -import requests -from PIL import ImageDraw, ImageFont - - -def draw_coco_box(draw: ImageDraw.Draw, bbox, klass, proba): - x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox) - draw.rectangle(((x1, y1), (x2, y2)), outline="red") - - fnt = ImageFont.truetype("Pillow/Tests/fonts/FreeMono.ttf", 30) - - draw.text((x1, y2), text=f"{klass}: {proba:.2f}", fill=(0, 0, 0, 100), font=fnt) - - -def draw_coco_boxes(image, bboxes, classes, probas): - - draw = ImageDraw.Draw(image) - for bbox, klass, proba in zip(bboxes, classes, probas): - draw_coco_box(draw, bbox, klass, proba) - - return image - - -def annotate(pdf_path, predictions): - pages = pdf2image.convert_from_path(pdf_path) - - for prd in predictions: - page_idx, boxes, classes, probas = itemgetter("page_idx", "bboxes", "classes", "probas")(prd) - page = pages[page_idx] - image = draw_coco_boxes(page, boxes, classes, probas) - image.save(f"/tmp/serv_out/{page_idx}.png") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--pdf_path", required=True) - args = parser.parse_args() - - return args - - -def main(args): - - response = requests.post("http://127.0.0.1:5000", data=open(args.pdf_path, "rb")) - response.raise_for_status() - predictions = response.json() - - print(json.dumps(predictions, indent=2)) - annotate(args.pdf_path, predictions) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/scripts/flask_test.py b/scripts/flask_test.py deleted file mode 100644 index ba95c12..0000000 --- a/scripts/flask_test.py +++ /dev/null @@ -1,35 +0,0 @@ -import argparse - -from PIL import Image -from flask import Flask, request, jsonify -from pathlib import Path - -app = Flask(__name__) - - -@app.before_first_request -def init(): - from image_prediction.predictor import Predictor - - global PRED - - PRED = Predictor(args.resume) - - -@app.route("/", methods=["GET", "POST"]) -def predict_request(): - if request.method == "POST": - image_folder_path = request.form.get("image_folder_path") - images = list(map(Image.open, Path(image_folder_path).glob("*.png"))) - results = PRED.predict(images, format_output=True) - for result in results: - return jsonify(result) - if request.method == "GET": - return "Not implemented" - - -parser = argparse.ArgumentParser() -parser.add_argument("--resume", required=True) -args = parser.parse_args() - -app.run() diff --git a/scripts/pyinfra_mock.py b/scripts/pyinfra_mock.py new file mode 100644 index 0000000..fec12e9 --- /dev/null +++ b/scripts/pyinfra_mock.py @@ -0,0 +1,26 @@ +import argparse +import json + +import requests + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--pdf_path", required=True) + args = parser.parse_args() + + return args + + +def main(args): + + response = requests.post("http://127.0.0.1:5000", data=open(args.pdf_path, "rb")) + response.raise_for_status() + predictions = response.json() + + print(json.dumps(predictions, indent=2)) + + +if __name__ == "__main__": + args = parse_args() + main(args) From 1e5da128f101245277cb9778c6bc5dc5fdf2f585 Mon Sep 17 00:00:00 2001 From: cdietrich Date: Wed, 2 Mar 2022 10:24:33 +0100 Subject: [PATCH 4/5] refactor --- image_prediction/predictor.py | 32 ++++++++++++++++++- src/serve.py | 59 ++--------------------------------- 2 files changed, 34 insertions(+), 57 deletions(-) diff --git a/image_prediction/predictor.py b/image_prediction/predictor.py index 3d89a69..d994baf 100644 --- a/image_prediction/predictor.py +++ b/image_prediction/predictor.py @@ -1,13 +1,16 @@ import logging +from itertools import chain from operator import itemgetter -from typing import List, Dict +from typing import List, Dict, Iterable import numpy as np from image_prediction.config import CONFIG from image_prediction.locations import MLRUNS_DIR, BASE_WEIGHTS from incl.redai_image.redai.redai.backend.model.model_handle import ModelHandle +from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch from incl.redai_image.redai.redai.utils.mlflow_reader import MlflowModelReader +from incl.redai_image.redai.redai.utils.shared import chunk_iterable class Predictor: @@ -86,3 +89,30 @@ class Predictor: predictions = [{"class": c, "probabilities": c2p} for c, c2p in zip(classes, class2prob_per_item)] return predictions if probabilities else classes + + +def extract_image_metadata_pairs(pdf_path: str, **kwargs): + def image_is_large_enough(metadata: dict): + x1, x2, y1, y2 = itemgetter("x1", "x2", "y1", "y2")(metadata) + + return abs(x1 - x2) > 2 and abs(y1 - y2) > 2 + + yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs) + + +def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size): + def process_chunk(chunk): + images, metadata = zip(*chunk) + predictions = predictor.predict(images, probabilities=True) + return predictions, metadata + + def predict(image_metadata_pair_generator): + chunks = chunk_iterable(image_metadata_pair_generator, n=batch_size) + return map(chain.from_iterable, zip(*map(process_chunk, chunks))) + + try: + predictions, metadata = predict(image_metadata_pairs) + return predictions, metadata + + except ValueError: + return [], [] diff --git a/src/serve.py b/src/serve.py index 4c292d3..bc6bae2 100644 --- a/src/serve.py +++ b/src/serve.py @@ -1,38 +1,15 @@ -import argparse -import json import logging import tempfile -from itertools import chain -from operator import itemgetter -from typing import Iterable from flask import Flask, request, jsonify from waitress import serve from image_prediction.config import CONFIG -from image_prediction.predictor import Predictor +from image_prediction.predictor import Predictor, extract_image_metadata_pairs, classify_images from image_prediction.response import build_response -from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch -from incl.redai_image.redai.redai.utils.shared import chunk_iterable -def suppress_userwarnings(): - import warnings - - warnings.filterwarnings("ignore") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--warnings", action="store_true", default=False) - args = parser.parse_args() - - return args - - -def main(args): - if not args.warnings: - suppress_userwarnings() +def main(): predictor = Predictor() logging.info("Predictor ready.") @@ -56,7 +33,6 @@ def main(args): pdf = request.data logging.debug("Running predictor on document...") - # extract images from pdfs with tempfile.NamedTemporaryFile() as tmp_file: tmp_file.write(pdf) image_metadata_pairs = extract_image_metadata_pairs(tmp_file.name) @@ -85,33 +61,6 @@ def run_prediction_server(app, mode="development"): serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port) -def extract_image_metadata_pairs(pdf_path: str, **kwargs): - def image_is_large_enough(metadata: dict): - x1, x2, y1, y2 = itemgetter("x1", "x2", "y1", "y2")(metadata) - - return abs(x1 - x2) > 2 and abs(y1 - y2) > 2 - - yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs) - - -def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size): - def process_chunk(chunk): - images, metadata = zip(*chunk) - predictions = predictor.predict(images, probabilities=True) - return predictions, metadata - - def predict(image_metadata_pair_generator): - chunks = chunk_iterable(image_metadata_pair_generator, n=batch_size) - return map(chain.from_iterable, zip(*map(process_chunk, chunks))) - - try: - predictions, metadata = predict(image_metadata_pairs) - return predictions, metadata - - except ValueError: - return [], [] - - if __name__ == "__main__": logging_level = CONFIG.service.logging_level logging.basicConfig(level=logging_level) @@ -120,6 +69,4 @@ if __name__ == "__main__": logging.getLogger("werkzeug").setLevel(logging.ERROR) logging.getLogger("waitress").setLevel(logging.ERROR) - args = parse_args() - - main(args) + main() From 8a1df76078cb05ded0152ae4a8d8af275a25bc4e Mon Sep 17 00:00:00 2001 From: cdietrich Date: Wed, 2 Mar 2022 10:28:32 +0100 Subject: [PATCH 5/5] blacked --- image_prediction/predictor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/image_prediction/predictor.py b/image_prediction/predictor.py index d994baf..4450e1a 100644 --- a/image_prediction/predictor.py +++ b/image_prediction/predictor.py @@ -27,9 +27,7 @@ class Predictor: """ try: if model_handle is None: - reader = MlflowModelReader( - run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR - ) + reader = MlflowModelReader(run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR) self.model_handle = reader.get_model_handle(BASE_WEIGHTS) else: self.model_handle = model_handle