Pull request #2: Integrate image service
Merge in RR/image-prediction from integrate_image_service to master * commit '8a1df76078cb05ded0152ae4a8d8af275a25bc4e': blacked refactor tidy up repo RED-3501: adapt service-container to image-service-v2 adapt service-container to image-service-v2
This commit is contained in:
commit
77fec1801c
1
.dvc/.gitignore
vendored
1
.dvc/.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
/config.local
|
||||
/tmp
|
||||
/cache
|
||||
/plots/
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
[core]
|
||||
remote = vector
|
||||
autostage = true
|
||||
['remote "vector"']
|
||||
url = ssh://vector.iqser.com/research/detr_server/
|
||||
url = ssh://vector.iqser.com/research/image_service/
|
||||
port = 22
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@ -171,3 +171,5 @@ fabric.properties
|
||||
.idea/codestream.xml
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
|
||||
/image_prediction/data/mlruns/
|
||||
/data/mlruns/
|
||||
|
||||
6
.gitmodules
vendored
6
.gitmodules
vendored
@ -1,3 +1,3 @@
|
||||
[submodule "incl/detr"]
|
||||
path = incl/detr
|
||||
url = ssh://git@git.iqser.com:2222/rr/detr.git
|
||||
[submodule "incl/redai_image"]
|
||||
path = incl/redai_image
|
||||
url = ssh://git@git.iqser.com:2222/rr/redai_image.git
|
||||
|
||||
@ -1,23 +1,24 @@
|
||||
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
|
||||
ARG VERSION_TAG="latest"
|
||||
|
||||
FROM ${BASE_ROOT}fb-detr-base:${VERSION_TAG}
|
||||
FROM ${BASE_ROOT}image-prediction-base:${VERSION_TAG}
|
||||
|
||||
WORKDIR /app/service
|
||||
|
||||
COPY ./src ./src
|
||||
COPY ./incl/detr ./incl/detr
|
||||
COPY ./fb_detr ./fb_detr
|
||||
COPY ./incl/redai_image ./incl/redai_image
|
||||
COPY image_prediction ./image_prediction
|
||||
COPY ./setup.py ./setup.py
|
||||
COPY ./requirements.txt ./requirements.txt
|
||||
COPY ./config.yaml ./config.yaml
|
||||
COPY data data
|
||||
|
||||
# Install dependencies differing from base image.
|
||||
RUN python3 -m pip install -r requirements.txt
|
||||
|
||||
RUN python3 -m pip install -e .
|
||||
|
||||
WORKDIR /app/service/incl/detr
|
||||
WORKDIR /app/service/incl/redai_image/redai
|
||||
RUN python3 -m pip install -e .
|
||||
WORKDIR /app/service
|
||||
|
||||
|
||||
@ -10,11 +10,13 @@ RUN python -m pip install --upgrade pip
|
||||
# Make a directory for the service files and copy the service repo into the container.
|
||||
WORKDIR /app/service
|
||||
COPY ./requirements.txt ./requirements.txt
|
||||
COPY ./data ./data
|
||||
COPY ./incl/redai_image/redai/requirements_user.txt ./requirements_redai.txt
|
||||
|
||||
# Install dependencies.
|
||||
RUN python3 -m pip install -r requirements.txt
|
||||
|
||||
RUN python3 -m pip install -r requirements_redai.txt
|
||||
|
||||
# Make a new container and copy all relevant files over to filter out temporary files
|
||||
# produced during setup to reduce the final container's size.
|
||||
FROM python:3.8
|
||||
@ -24,7 +26,3 @@ COPY --from=builder1 /app .
|
||||
ENV PATH="/app/venv/bin:$PATH"
|
||||
|
||||
WORKDIR /app/service
|
||||
|
||||
RUN apt update --yes
|
||||
RUN apt install vim --yes
|
||||
RUN apt install poppler-utils --yes
|
||||
|
||||
@ -7,7 +7,7 @@ setup/docker.sh
|
||||
|
||||
Build head image
|
||||
```bash
|
||||
docker build -f Dockerfile -t detr-server . --build-arg BASE_ROOT=""
|
||||
docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT=""
|
||||
```
|
||||
|
||||
### Usage
|
||||
@ -15,11 +15,11 @@ docker build -f Dockerfile -t detr-server . --build-arg BASE_ROOT=""
|
||||
Shell 1
|
||||
|
||||
```bash
|
||||
docker run --rm --net=host --rm detr-server
|
||||
docker run --rm --net=host image-prediction
|
||||
```
|
||||
|
||||
Shell 2
|
||||
|
||||
```bash
|
||||
python scripts/client_mock.py --pdf_path /path/to/a/pdf
|
||||
python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf
|
||||
```
|
||||
|
||||
@ -33,8 +33,8 @@ import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
|
||||
@BambooSpec
|
||||
public class PlanSpec {
|
||||
|
||||
private static final String SERVICE_NAME = "fb-detr";
|
||||
private static final String SERVICE_NAME_BASE = "fb-detr-base";
|
||||
private static final String SERVICE_NAME = "image-prediction";
|
||||
private static final String SERVICE_NAME_BASE = "image-prediction-base";
|
||||
|
||||
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
|
||||
|
||||
@ -72,7 +72,7 @@ public class PlanSpec {
|
||||
return new Plan(
|
||||
project(),
|
||||
SERVICE_NAME, new BambooKey(SERVICE_KEY))
|
||||
.description("Docker build for fb-detr.")
|
||||
.description("Docker build for image-prediction.")
|
||||
// .variables()
|
||||
.stages(new Stage("Build Stage")
|
||||
.jobs(
|
||||
@ -85,8 +85,8 @@ public class PlanSpec {
|
||||
.description("Checkout default repository.")
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new VcsCheckoutTask()
|
||||
.description("Checkout detr research repository.")
|
||||
.checkoutItems(new CheckoutItem().repository("RR / DETR").path("DETR")),
|
||||
.description("Checkout redai_image research repository.")
|
||||
.checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
|
||||
new ScriptTask()
|
||||
.description("Set config and keys.")
|
||||
.inlineBody("mkdir -p ~/.ssh\n" +
|
||||
@ -112,8 +112,8 @@ public class PlanSpec {
|
||||
.description("Checkout default repository.")
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new VcsCheckoutTask()
|
||||
.description("Checkout detr research repository.")
|
||||
.checkoutItems(new CheckoutItem().repository("RR / DETR").path("DETR")),
|
||||
.description("Checkout redai_image repository.")
|
||||
.checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
|
||||
new ScriptTask()
|
||||
.description("Set config and keys.")
|
||||
.inlineBody("mkdir -p ~/.ssh\n" +
|
||||
@ -174,7 +174,7 @@ public class PlanSpec {
|
||||
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
|
||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
|
||||
.linkedRepositories("RR / " + SERVICE_NAME)
|
||||
.linkedRepositories("RR / DETR")
|
||||
.linkedRepositories("RR / redai_image")
|
||||
.triggers(new BitbucketServerTrigger())
|
||||
.planBranchManagement(new PlanBranchManagement()
|
||||
.createForVcsBranch()
|
||||
|
||||
@ -10,7 +10,7 @@ python3 -m pip install --upgrade pip
|
||||
echo "dev setup for unit test and coverage 💖"
|
||||
|
||||
pip install -e .
|
||||
pip install -e incl/detr
|
||||
pip install -e incl/redai_image
|
||||
pip install -r requirements.txt
|
||||
|
||||
SERVICE_NAME=$1
|
||||
@ -19,14 +19,14 @@ echo "dependency-check:aggregate"
|
||||
mkdir -p reports
|
||||
dependency-check --enableExperimental -f JSON -f HTML -f XML \
|
||||
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
|
||||
--exclude "build_venv/**" --exclude "**/__pycache__/**"
|
||||
--exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
|
||||
|
||||
if [[ -z "${bamboo_repository_pr_key}" ]]
|
||||
then
|
||||
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
|
||||
/usr/bin/sonar-scanner/bin/sonar-scanner \
|
||||
-Dsonar.projectKey=RED_$SERVICE_NAME \
|
||||
-Dsonar.sources=incl/image_service/image_service,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \
|
||||
-Dsonar.sources=image_prediction,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \
|
||||
-Dsonar.host.url=https://sonarqube.iqser.com \
|
||||
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
|
||||
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
|
||||
@ -39,7 +39,7 @@ else
|
||||
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
|
||||
/usr/bin/sonar-scanner/bin/sonar-scanner \
|
||||
-Dsonar.projectKey=RED_$SERVICE_NAME \
|
||||
-Dsonar.sources=incl/image_service/image_service,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \
|
||||
-Dsonar.sources=image_prediction,incl/redai_image/redai/redai/backend,incl/redai_image/redai/redai/utils,src,incl/redai_image/redai/redai/model/efficientnetb0mod.py \
|
||||
-Dsonar.host.url=https://sonarqube.iqser.com \
|
||||
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
|
||||
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
|
||||
|
||||
26
config.yaml
26
config.yaml
@ -1,10 +1,3 @@
|
||||
estimator:
|
||||
checkpoint: checkpoint.pth
|
||||
classes: ["logo", "other", "formula", "signature", "handwriting_other"]
|
||||
rejection_class: "other"
|
||||
threshold: .5
|
||||
device: cpu
|
||||
|
||||
webserver:
|
||||
host: $SERVER_HOST|"127.0.0.1" # webserver address
|
||||
port: $SERVER_PORT|5000 # webserver port
|
||||
@ -14,3 +7,22 @@ service:
|
||||
logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger
|
||||
batch_size: $BATCH_SIZE|2 # Number of images in memory simultaneously
|
||||
verbose: $VERBOSE|True # Service prints document processing progress to stdout
|
||||
run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the model from
|
||||
|
||||
|
||||
# These variables control filters that are applied to either images, image metadata or model predictions. The filter
|
||||
# result values are reported in the service responses. For convenience the response to a request contains a
|
||||
# "filters.allPassed" field, which is set to false if any filter's returned value does not meet its specified
|
||||
# required value.
|
||||
filters:
|
||||
|
||||
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
|
||||
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
|
||||
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
|
||||
|
||||
image_width_to_height_quotient: # Image width to height ratio
|
||||
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
|
||||
max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
|
||||
|
||||
min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
|
||||
|
||||
|
||||
1
data/.gitignore
vendored
1
data/.gitignore
vendored
@ -1 +0,0 @@
|
||||
/checkpoint.pth
|
||||
4
data/base_weights.h5.dvc
Normal file
4
data/base_weights.h5.dvc
Normal file
@ -0,0 +1,4 @@
|
||||
outs:
|
||||
- md5: 6d0186c1f25e889d531788f168fa6cf0
|
||||
size: 16727296
|
||||
path: base_weights.h5
|
||||
@ -1,4 +0,0 @@
|
||||
outs:
|
||||
- md5: 9face65530febd41a0722e0513da2264
|
||||
size: 496696129
|
||||
path: checkpoint.pth
|
||||
1
data/hub/checkpoints/.gitignore
vendored
1
data/hub/checkpoints/.gitignore
vendored
@ -1 +0,0 @@
|
||||
/resnet50-0676ba61.pth
|
||||
@ -1,4 +0,0 @@
|
||||
outs:
|
||||
- md5: b94941323912291bb67db6fdb1d80c11
|
||||
size: 102530333
|
||||
path: resnet50-0676ba61.pth
|
||||
5
data/mlruns.dvc
Normal file
5
data/mlruns.dvc
Normal file
@ -0,0 +1,5 @@
|
||||
outs:
|
||||
- md5: d1c708270bab6fcd344d4a8b05d1103d.dir
|
||||
size: 150225383
|
||||
nfiles: 178
|
||||
path: mlruns
|
||||
@ -1,10 +0,0 @@
|
||||
version: "3.3"
|
||||
services:
|
||||
detr-server:
|
||||
image: detr-server
|
||||
network_mode: "host"
|
||||
read_only: true
|
||||
volumes:
|
||||
- tmp:/tmp:rw
|
||||
volumes:
|
||||
tmp:
|
||||
@ -1,7 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
MODULE_ROOT = Path(__file__).resolve().parents[1]
|
||||
CONFIG_FILE = MODULE_ROOT / "config.yaml"
|
||||
DATA_DIR = MODULE_ROOT / "data"
|
||||
TORCH_HOME = DATA_DIR
|
||||
@ -1,162 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
from itertools import compress, starmap, chain
|
||||
from operator import itemgetter
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import torch
|
||||
from iteration_utilities import starfilter
|
||||
from tqdm import tqdm
|
||||
|
||||
from detr.models import build_model
|
||||
from detr.prediction import get_args_parser, infer
|
||||
from fb_detr.config import CONFIG
|
||||
from fb_detr.utils.non_max_supprs import greedy_non_max_supprs
|
||||
from fb_detr.utils.stream import stream_pages, chunk_iterable, get_page_count
|
||||
|
||||
|
||||
def load_model(checkpoint_path):
    """Build the detr model and load its weights from a checkpoint file.

    The model hyper-parameters come from detr's own argument parser, which is
    evaluated against the current process command line (``parse_args()`` with
    no explicit argument list).

    Args:
        checkpoint_path: Path to a checkpoint readable by ``torch.load`` that
            holds the weights under the ``"model"`` key.

    Returns:
        The model, moved to the device named by ``CONFIG.estimator.device``.
    """
    cli_args = argparse.ArgumentParser(parents=[get_args_parser()]).parse_args()

    if cli_args.output_dir:
        Path(cli_args.output_dir).mkdir(parents=True, exist_ok=True)

    target_device = torch.device(CONFIG.estimator.device)

    detr_model, _, _ = build_model(cli_args)

    # Weights are always deserialized onto the CPU first and moved afterwards.
    state = torch.load(checkpoint_path, map_location="cpu")
    detr_model.load_state_dict(state["model"])
    detr_model.to(target_device)

    return detr_model
|
||||
|
||||
|
||||
class Predictor:
    """Runs the detection model on page images and post-processes its output.

    Post-processing consists of formatting the raw arrays into plain Python
    structures, optionally dropping detections of a rejection class, and a
    greedy non-maximum suppression per page.
    """

    def __init__(self, checkpoint_path, classes=None, rejection_class=None):
        """
        Args:
            checkpoint_path: Checkpoint passed on to ``load_model``.
            classes: Optional lookup from class id to class name. When falsy,
                class ids are reported unchanged.
            rejection_class: Optional class name whose detections are dropped.
        """
        self.model = load_model(checkpoint_path)
        self.classes = classes
        self.rejection_class = rejection_class

    @staticmethod
    def __format_boxes(boxes):
        """Turn a 2-D box array (columns x1, y1, x2, y2) into a list of dicts."""
        keys = "x1", "y1", "x2", "y2"

        x1s = boxes[:, 0].tolist()
        y1s = boxes[:, 1].tolist()
        x2s = boxes[:, 2].tolist()
        y2s = boxes[:, 3].tolist()

        return [dict(zip(keys, vs)) for vs in zip(x1s, y1s, x2s, y2s)]

    @staticmethod
    def __normalize_to_list(maybe_multiple):
        """Wrap a single value in a 1-tuple; pass tuples through unchanged.

        Needed because ``itemgetter`` returns a bare item (not a tuple) when it
        is given exactly one key.
        """
        return maybe_multiple if isinstance(maybe_multiple, tuple) else tuple([maybe_multiple])

    def __format_classes(self, classes):
        """Map class ids to the configured class names, if a mapping was given."""
        if self.classes:
            return self.__normalize_to_list(itemgetter(*classes.tolist())(self.classes))
        else:
            return classes.tolist()

    @staticmethod
    def __format_probas(probas):
        """Reduce each row of per-class probabilities to its maximum."""
        return probas.max(axis=1).tolist()

    def __format_prediction(self, predictions: dict):
        """Replace the "bboxes", "classes" and "probas" entries by plain Python values (in place)."""
        boxes, classes, probas = itemgetter("bboxes", "classes", "probas")(predictions)

        if len(boxes):
            boxes = self.__format_boxes(boxes)
            classes = self.__format_classes(classes)
            probas = self.__format_probas(probas)
        else:
            boxes, classes, probas = [], [], []

        predictions["bboxes"] = boxes
        predictions["classes"] = classes
        predictions["probas"] = probas

        return predictions

    def __filter_predictions_for_image(self, predictions):
        """Drop all detections of one image whose class equals the rejection class."""
        boxes, classes, probas = itemgetter("bboxes", "classes", "probas")(predictions)

        if boxes:
            keep = map(lambda c: c != self.rejection_class, classes)
            compressed = list(compress(zip(boxes, classes, probas), keep))
            boxes, classes, probas = map(list, zip(*compressed)) if compressed else ([], [], [])
            predictions["bboxes"] = boxes
            predictions["classes"] = classes
            predictions["probas"] = probas

        return predictions

    def filter_predictions(self, predictions):
        """Remove rejected detections and images left without any detection.

        Each surviving prediction gains a "page_idx" key holding the image's
        position within ``predictions``.

        Returns:
            A lazy iterator over the surviving prediction dicts.
        """

        def detections_present(_, prediction):
            return bool(prediction["classes"])

        def build_return_dict(page_idx, predictions):
            return {"page_idx": page_idx, **predictions}

        filtered_rejections = map(self.__filter_predictions_for_image, predictions)
        filtered_no_detections = starfilter(detections_present, enumerate(filtered_rejections))
        filtered_no_detections = starmap(build_return_dict, filtered_no_detections)

        return filtered_no_detections

    def format_predictions(self, outputs: Iterable):
        """Lazily format every raw model output."""
        return map(self.__format_prediction, outputs)

    def __non_max_supprs(self, predictions):
        """Lazily apply greedy non-maximum suppression to every prediction."""
        predictions = map(greedy_non_max_supprs, predictions)
        return predictions

    def predict(self, images, threshold=None):
        """Predict detections for a batch of images.

        Args:
            images: Images handed to ``infer``.
            threshold: Threshold forwarded to ``infer``; a falsy value selects
                ``CONFIG.estimator.threshold``.

        Returns:
            List of prediction dicts. "page_idx" (index within ``images``) is
            only present when a rejection class is configured, because it is
            added by ``filter_predictions``.
        """
        if not threshold:
            threshold = CONFIG.estimator.threshold

        predictions = infer(images, self.model, CONFIG.estimator.device, threshold)
        predictions = self.format_predictions(predictions)
        if self.rejection_class:
            predictions = self.filter_predictions(predictions)

        predictions = self.__non_max_supprs(predictions)

        predictions = list(predictions)

        return predictions

    def predict_pdf(self, pdf: bytes):
        """Predict detections for every page of a PDF, processed in batches.

        Args:
            pdf: Raw bytes of the PDF document.

        Returns:
            List of prediction dicts whose "page_idx" is the zero-based index
            of the page within the whole document.
        """
        batch_size = CONFIG.service.batch_size

        def progress(generator):
            page_count = get_page_count(pdf)
            # Ceiling division: a trailing partial batch still counts as a batch.
            batch_count = -(-page_count // batch_size)

            yield from tqdm(
                generator, total=batch_count, position=1, leave=True
            ) if CONFIG.service.verbose else generator

        def predict_batch(batch_idx, batch):
            # Index of the batch's first page within the document. Adding only
            # `batch_idx` would make indices of consecutive batches overlap.
            first_page_idx = batch_idx * batch_size
            predictions = self.predict(batch)
            for idx_in_batch, p in enumerate(predictions):
                # Without a rejection class nothing is filtered out, so no
                # "page_idx" was assigned and list position equals the index
                # within the batch.
                p.setdefault("page_idx", idx_in_batch)
                p["page_idx"] += first_page_idx

            return predictions

        page_stream = stream_pages(pdf)
        page_batches = chunk_iterable(page_stream, batch_size)
        predictions = list(chain(*starmap(predict_batch, progress(enumerate(page_batches)))))

        return predictions
|
||||
@ -1,32 +0,0 @@
|
||||
import os
|
||||
|
||||
from fb_detr.config import CONFIG
|
||||
from fb_detr.locations import DATA_DIR, TORCH_HOME
|
||||
from fb_detr.predictor import Predictor
|
||||
|
||||
|
||||
def suppress_userwarnings():
    """Install an "ignore" filter for warnings.

    Despite the name, the filter is not restricted to ``UserWarning``: no
    category is passed, so it applies to every warning category.
    """
    from warnings import filterwarnings

    filterwarnings("ignore")
|
||||
|
||||
|
||||
def load_classes():
    """Return a mapping from 1-based class id to the class names in ``CONFIG.estimator.classes``."""
    return dict(enumerate(CONFIG.estimator.classes, start=1))
|
||||
|
||||
|
||||
def get_checkpoint():
    """Return the path of the configured checkpoint file inside ``DATA_DIR``."""
    checkpoint_name = CONFIG.estimator.checkpoint
    return DATA_DIR / checkpoint_name
|
||||
|
||||
|
||||
def set_torch_env():
    """Export ``TORCH_HOME`` (as a string) into the process environment."""
    os.environ.update({"TORCH_HOME": str(TORCH_HOME)})
|
||||
|
||||
|
||||
def initialize_predictor(resume):
    """Create a ``Predictor`` from the configuration.

    Args:
        resume: Checkpoint to load; a falsy value selects the configured
            default checkpoint from ``get_checkpoint()``.

    Returns:
        A ``Predictor`` built with the configured classes and rejection class.
    """
    set_torch_env()
    checkpoint = resume or get_checkpoint()
    return Predictor(
        checkpoint,
        classes=load_classes(),
        rejection_class=CONFIG.estimator.rejection_class,
    )
|
||||
@ -1,96 +0,0 @@
|
||||
from collections import namedtuple
|
||||
from itertools import starmap, combinations
|
||||
from operator import attrgetter, itemgetter
|
||||
|
||||
from frozendict import frozendict
|
||||
|
||||
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
|
||||
|
||||
|
||||
def make_box(x1, y1, x2, y2):
    """Bundle four coordinates into a box dict with keys x1, y1, x2, y2 (in that order)."""
    return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
|
||||
|
||||
|
||||
def compute_intersection(a, b):
    """Return the overlap area of two boxes, or 0 when they do not overlap.

    Both arguments are mappings whose values, in iteration order, are read as
    (xmin, ymin, xmax, ymax).
    """
    rect_a = Rectangle(*a.values())
    rect_b = Rectangle(*b.values())

    overlap_w = min(rect_a.xmax, rect_b.xmax) - max(rect_a.xmin, rect_b.xmin)
    overlap_h = min(rect_a.ymax, rect_b.ymax) - max(rect_a.ymin, rect_b.ymin)

    if overlap_w >= 0 and overlap_h >= 0:
        return overlap_w * overlap_h
    return 0
|
||||
|
||||
|
||||
def compute_union(a, b):
    """Return the area covered by either box: area(a) + area(b) - intersection(a, b)."""
    rect_a = Rectangle(*a.values())
    area_a = (rect_a.xmax - rect_a.xmin) * (rect_a.ymax - rect_a.ymin)

    rect_b = Rectangle(*b.values())
    area_b = (rect_b.xmax - rect_b.xmin) * (rect_b.ymax - rect_b.ymin)

    return (area_a + area_b) - compute_intersection(a, b)
|
||||
|
||||
|
||||
def compute_iou(a, b):
    """Return intersection over union of two boxes.

    Returns 0.0 when the union area is zero (both boxes degenerate), instead
    of raising ``ZeroDivisionError``.
    """
    union = compute_union(a, b)
    if union == 0:
        return 0.0
    return compute_intersection(a, b) / union
|
||||
|
||||
|
||||
LPBox = namedtuple("LPBox", "label proba box")
|
||||
|
||||
|
||||
def less_likely(a, b):
    """Return whichever argument has the smaller ``proba``; ``a`` wins ties."""
    return b if b.proba < a.proba else a
|
||||
|
||||
|
||||
def overlap_too_much(a, b, iou_thresh):
    """Tell whether the IoU of the two items' boxes strictly exceeds ``iou_thresh``."""
    return compute_iou(a.box, b.box) > iou_thresh
|
||||
|
||||
|
||||
def __greedy_non_max_supprs(lpboxes, iou_thresh=0.1):
    """Repeatedly drop the less likely box of every too-strongly overlapping pair.

    Passes over all pairs are repeated until a pass removes nothing.

    Args:
        lpboxes: Hashable items exposing ``proba`` and ``box``.
        iou_thresh: IoU above which two boxes count as overlapping too much.

    Returns:
        The set of surviving items.
    """
    survivors = set(lpboxes)

    changed = True
    while changed:
        size_before = len(survivors)

        # `combinations` snapshots the set, so removing items while looping is safe.
        for first, second in combinations(survivors, r=2):
            # Skip pairs with a member that was already removed in this pass.
            if first not in survivors or second not in survivors:
                continue
            if overlap_too_much(first, second, iou_thresh):
                survivors.discard(less_likely(first, second))

        changed = len(survivors) != size_before

    return survivors
|
||||
|
||||
|
||||
def lpboxes_to_dict(lpboxes):
    """Split labelled boxes back into parallel "bboxes", "classes" and "probas" lists.

    ``lpboxes`` is iterated three times, once per output list.
    """
    box_list = [dict(item.box) for item in lpboxes]
    label_list = [item.label for item in lpboxes]
    proba_list = [item.proba for item in lpboxes]

    return {"bboxes": box_list, "classes": label_list, "probas": proba_list}
|
||||
|
||||
|
||||
def greedy_non_max_supprs(predictions):
    """Apply greedy non-maximum suppression to one prediction dict, in place.

    Args:
        predictions: Dict with parallel "bboxes", "classes" and "probas" entries.

    Returns:
        The same dict, with those three entries replaced by the survivors.
    """
    boxes = predictions["bboxes"]
    classes = predictions["classes"]
    probas = predictions["probas"]

    # Boxes are frozen so the combined items are hashable and can live in a set.
    candidates = [LPBox(label, proba, frozendict(box)) for label, proba, box in zip(classes, probas, boxes)]

    survivors = __greedy_non_max_supprs(candidates)
    predictions.update(lpboxes_to_dict(survivors))

    return predictions
|
||||
@ -1,20 +0,0 @@
|
||||
from itertools import takewhile, starmap, islice, repeat
|
||||
from operator import truth
|
||||
|
||||
from pdf2image import pdf2image
|
||||
|
||||
|
||||
def chunk_iterable(iterable, n):
    """Lazily split ``iterable`` into consecutive tuples of at most ``n`` items.

    The last tuple may be shorter; iteration stops at the first empty chunk.
    """
    source = iter(iterable)
    # Two-argument iter(): call the lambda until it returns the sentinel ().
    return iter(lambda: tuple(islice(source, n)), ())
|
||||
|
||||
|
||||
def get_page_count(pdf):
    """Return the "Pages" entry that pdf2image reports for the PDF bytes."""
    info = pdf2image.pdfinfo_from_bytes(pdf)
    return info["Pages"]
|
||||
|
||||
|
||||
def stream_pages(pdf):
    """Lazily render every page of a PDF to an image, one page per step.

    Args:
        pdf: Raw bytes of the PDF document.

    Returns:
        Iterator over the page images in document order.
    """

    def page_to_image(page_number):
        # pdf2image page numbers are 1-based and `last_page` is inclusive, so
        # first_page == last_page renders exactly the requested page.
        return pdf2image.convert_from_bytes(pdf, first_page=page_number, last_page=page_number)[0]

    page_count = get_page_count(pdf)
    return map(page_to_image, range(1, page_count + 1))
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
from envyaml import EnvYAML
|
||||
|
||||
from fb_detr.locations import CONFIG_FILE
|
||||
from image_prediction.locations import CONFIG_FILE
|
||||
|
||||
|
||||
def _get_item_and_maybe_make_dotindexable(container, item):
|
||||
14
image_prediction/locations.py
Normal file
14
image_prediction/locations.py
Normal file
@ -0,0 +1,14 @@
|
||||
from os import path
|
||||
|
||||
# Directory that contains this module.
MODULE_DIR = path.dirname(path.abspath(__file__))
# Parent of the module directory; config and data paths are resolved against it.
PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
# NOTE(review): two levels above PACKAGE_ROOT_DIR - confirm this is the intended root.
REPO_ROOT_DIR = path.dirname(path.dirname(PACKAGE_ROOT_DIR))

DOCKER_COMPOSE_FILE = path.join(REPO_ROOT_DIR, "docker-compose.yaml")
CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
LOG_FILE = "/tmp/log.log"

# Data artifacts.
DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
MLRUNS_DIR = path.join(DATA_DIR, "mlruns")
BASE_WEIGHTS = path.join(DATA_DIR, "base_weights.h5")
|
||||
116
image_prediction/predictor.py
Normal file
116
image_prediction/predictor.py
Normal file
@ -0,0 +1,116 @@
|
||||
import logging
|
||||
from itertools import chain
|
||||
from operator import itemgetter
|
||||
from typing import List, Dict, Iterable
|
||||
|
||||
import numpy as np
|
||||
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.locations import MLRUNS_DIR, BASE_WEIGHTS
|
||||
from incl.redai_image.redai.redai.backend.model.model_handle import ModelHandle
|
||||
from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch
|
||||
from incl.redai_image.redai.redai.utils.mlflow_reader import MlflowModelReader
|
||||
from incl.redai_image.redai.redai.utils.shared import chunk_iterable
|
||||
|
||||
|
||||
class Predictor:
    """`ModelHandle` wrapper. Forwards to wrapped model handle for prediction and produces structured output that is
    interpretable independently of the wrapped model (e.g. with regard to a .classes_ attribute).
    """

    def __init__(self, model_handle: "ModelHandle" = None):
        """Initializes a Predictor.

        Args:
            model_handle: ModelHandle object to forward to for prediction. By default, a model handle is loaded from the
                mlflow database via CONFIG.service.run_id.
        """
        try:
            if model_handle is None:
                reader = MlflowModelReader(run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR)
                self.model_handle = reader.get_model_handle(BASE_WEIGHTS)
            else:
                self.model_handle = model_handle

            self.classes = self.model_handle.model.classes_
            self.classes_readable = np.array(self.model_handle.classes)
            # Readable class names, ordered like the columns of predict_proba.
            self.classes_readable_aligned = self.classes_readable[self.classes]
        except Exception as e:
            # Initialization stays best-effort (no re-raise), but the failure
            # is logged at ERROR level with its traceback so it is not missed.
            logging.exception(f"Service estimator initialization failed: {e}")

    def __make_predictions_human_readable(self, probs: np.ndarray) -> List[str]:
        """Translates an n x m matrix of probabilities over classes into an n-element list of readable class names,
        one (the most probable class) per row.

        Args:
            probs: probability matrix (items x classes)

        Returns:
            list of readable class names.
        """
        classes = np.argmax(probs, axis=1)
        classes = self.classes[classes]
        classes_readable = [self.model_handle.classes[c] for c in classes]
        return classes_readable

    def predict(self, images: List, probabilities: bool = False, **kwargs):
        """Gathers predictions for list of images. Assigns each image a class and optionally a probability distribution
        over all classes.

        Args:
            images (List[PIL.Image]) : Images to gather predictions for.
            probabilities: Whether to return dictionaries of the following form instead of strings:
                {
                    "class": predicted class,
                    "probabilities": {
                        "class 1" : class 1 probability,
                        "class 2" : class 2 probability,
                        ...
                    }
                }
            **kwargs: Forwarded to the wrapped model's ``predict_proba``.

        Returns:
            By default the return value is a list of classes (meaningful class name strings). Alternatively a list of
            dictionaries with an additional probability field for estimated class probabilities per image can be
            returned. The probability mappings are sorted by descending probability.
        """
        X = self.model_handle.prep_images(list(images))

        probs_per_item = self.model_handle.model.predict_proba(X, **kwargs).astype(float)
        classes = self.__make_predictions_human_readable(probs_per_item)

        if not probabilities:
            # Skip building the per-class mappings when they are not requested.
            return classes

        class2prob_per_item = [
            dict(sorted(zip(self.classes_readable_aligned, probs), key=itemgetter(1), reverse=True))
            for probs in probs_per_item
        ]

        return [{"class": c, "probabilities": c2p} for c, c2p in zip(classes, class2prob_per_item)]
|
||||
|
||||
|
||||
def extract_image_metadata_pairs(pdf_path: str, **kwargs):
    """Yield whatever ``extract_and_stitch`` produces for the PDF, skipping tiny images.

    Images are requested in RGB. An image passes the filter only if both
    |x1 - x2| and |y1 - y2| from its metadata exceed 2.

    Args:
        pdf_path: Path of the PDF to extract images from.
        **kwargs: Forwarded to ``extract_and_stitch``.
    """

    def image_is_large_enough(metadata: dict):
        x_start, x_end = metadata["x1"], metadata["x2"]
        y_start, y_end = metadata["y1"], metadata["y2"]

        wide_enough = abs(x_start - x_end) > 2
        return wide_enough and abs(y_start - y_end) > 2

    yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs)
|
||||
|
||||
|
||||
def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size):
    """Classify images in batches and return predictions alongside their metadata.

    Args:
        predictor: Object whose ``predict(images, probabilities=True)`` returns one prediction per image.
        image_metadata_pairs: Iterable of (image, metadata) pairs.
        batch_size: Number of images passed to the predictor at once.

    Returns:
        A pair ``(predictions, metadata)`` of iterables in input order; ``([], [])`` when there are no images.
    """

    def process_chunk(chunk):
        images, metadata = zip(*chunk)
        return predictor.predict(images, probabilities=True), metadata

    chunks = chunk_iterable(image_metadata_pairs, n=batch_size)
    chunk_results = [process_chunk(chunk) for chunk in chunks]

    # Check for "no input" explicitly rather than catching ValueError, which
    # would also hide ValueErrors raised while predicting.
    if not chunk_results:
        return [], []

    predictions_per_chunk, metadata_per_chunk = zip(*chunk_results)
    return chain.from_iterable(predictions_per_chunk), chain.from_iterable(metadata_per_chunk)
|
||||
71
image_prediction/response.py
Normal file
71
image_prediction/response.py
Normal file
@ -0,0 +1,71 @@
|
||||
"""Defines functions for constructing service responses."""
|
||||
|
||||
|
||||
from itertools import starmap
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
|
||||
from image_prediction.config import CONFIG
|
||||
|
||||
|
||||
def build_response(predictions: list, metadata: list) -> list:
    """Build one image-info entry per (prediction, metadata) pair, in order."""
    return [build_image_info(prediction, meta) for prediction, meta in zip(predictions, metadata)]
|
||||
|
||||
|
||||
def build_image_info(prediction: dict, metadata: dict) -> dict:
    """Build the response entry for one image.

    The entry holds the classification, the image position and geometry, and
    the outcome of the filters configured under ``CONFIG.filters``.

    Args:
        prediction: Dict with "class" and "probabilities" (class -> probability). It is not modified.
        metadata: Dict with "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height" and "page_idx".

    Returns:
        Dict with the keys "classification", "position", "geometry" and "filters".
    """

    def compute_geometric_quotient():
        page_area_sqrt = np.sqrt(abs(page_width * page_height))
        image_area_sqrt = np.sqrt(abs(x2 - x1) * abs(y2 - y1))
        return image_area_sqrt / page_area_sqrt

    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
    )(metadata)

    quotient = compute_geometric_quotient()
    format_quotient = width / height

    size_limits = CONFIG.filters.image_to_page_quotient
    format_limits = CONFIG.filters.image_width_to_height_quotient

    min_image_to_page_quotient_breached = bool(quotient < size_limits.min)
    max_image_to_page_quotient_breached = bool(quotient > size_limits.max)
    min_image_width_to_height_quotient_breached = bool(format_quotient < format_limits.min)
    max_image_width_to_height_quotient_breached = bool(format_quotient > format_limits.max)

    min_confidence_breached = bool(max(prediction["probabilities"].values()) < CONFIG.filters.min_confidence)

    # Work on a copy so the caller's prediction dict is left untouched.
    classification = {key: value for key, value in prediction.items() if key != "class"}
    classification["label"] = prediction["class"]  # "class" as field name causes problem for Java objectmapper
    classification["probabilities"] = {
        klass: np.round(prob, 6) for klass, prob in prediction["probabilities"].items()
    }

    breaches = [
        max_image_to_page_quotient_breached,
        min_image_to_page_quotient_breached,
        min_image_width_to_height_quotient_breached,
        max_image_width_to_height_quotient_breached,
        min_confidence_breached,
    ]

    image_info = {
        "classification": classification,
        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": metadata["page_idx"] + 1},
        "geometry": {"width": width, "height": height},
        "filters": {
            "geometry": {
                "imageSize": {
                    "quotient": quotient,
                    "tooLarge": max_image_to_page_quotient_breached,
                    "tooSmall": min_image_to_page_quotient_breached,
                },
                "imageFormat": {
                    "quotient": format_quotient,
                    "tooTall": min_image_width_to_height_quotient_breached,
                    "tooWide": max_image_width_to_height_quotient_breached,
                },
            },
            "probability": {"unconfident": min_confidence_breached},
            "allPassed": not any(breaches),
        },
    }

    return image_info
|
||||
@ -1 +0,0 @@
|
||||
Subproject commit 772023801e4fd3deef7953f7f49fd6fb2bf60236
|
||||
1
incl/redai_image
Submodule
1
incl/redai_image
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 4c3b26d7673457aaa99e0663dad6950cd36da967
|
||||
@ -1,10 +1,3 @@
|
||||
torch==1.10.2
|
||||
numpy==1.22.1
|
||||
opencv-python-headless==4.5.5.62
|
||||
torchvision==0.11.3
|
||||
pycocotools==2.0.4
|
||||
scipy==1.7.3
|
||||
pdf2image==1.16.0
|
||||
Flask==2.0.2
|
||||
requests==2.27.1
|
||||
iteration-utilities==0.11.0
|
||||
@ -12,5 +5,10 @@ dvc==2.9.3
|
||||
dvc[ssh]
|
||||
frozendict==2.3.0
|
||||
waitress==2.0.0
|
||||
envyaml==1.10.211231
|
||||
envyaml~=1.8.210417
|
||||
dependency-check==0.6.*
|
||||
envyaml~=1.8.210417
|
||||
mlflow~=1.20.2
|
||||
numpy~=1.19.3
|
||||
PDFNetPython3~=9.1.0
|
||||
tqdm~=4.62.2
|
||||
|
||||
@ -1,58 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
from operator import itemgetter
|
||||
|
||||
import pdf2image
|
||||
import requests
|
||||
from PIL import ImageDraw, ImageFont
|
||||
|
||||
|
||||
def draw_coco_box(draw: ImageDraw.Draw, bbox, klass, proba):
    """Draw one red box outline and write "<class>: <probability>" at its (x1, y2) corner.

    Args:
        draw: Drawing context of the target image.
        bbox: Mapping with the keys x1, y1, x2, y2.
        klass: Class label to print.
        proba: Probability, printed with two decimals.
    """
    left, top, right, bottom = (bbox[key] for key in ("x1", "y1", "x2", "y2"))
    draw.rectangle(((left, top), (right, bottom)), outline="red")

    label_font = ImageFont.truetype("Pillow/Tests/fonts/FreeMono.ttf", 30)
    label = f"{klass}: {proba:.2f}"

    draw.text((left, bottom), text=label, fill=(0, 0, 0, 100), font=label_font)
|
||||
|
||||
|
||||
def draw_coco_boxes(image, bboxes, classes, probas):
    """Draw every (bbox, class, probability) triple onto ``image`` and return it.

    The three sequences are walked in lockstep with ``zip``, so surplus entries
    of a longer sequence are ignored. The image is drawn on in place.
    """
    canvas = ImageDraw.Draw(image)
    for triple in zip(bboxes, classes, probas):
        draw_coco_box(canvas, *triple)
    return image
|
||||
|
||||
|
||||
def annotate(pdf_path, predictions):
    """Render the pages of a PDF, draw the predicted boxes and save them as PNGs.

    Args:
        pdf_path: path of the PDF that is rasterized with ``pdf2image``.
        predictions: iterable of mappings with the keys "page_idx", "bboxes",
            "classes" and "probas"; "page_idx" indexes into the rendered pages.

    Each annotated page is written to ``/tmp/serv_out/<page_idx>.png``.
    """
    import os

    output_dir = "/tmp/serv_out"

    pages = pdf2image.convert_from_path(pdf_path)

    # The output directory is not guaranteed to exist; without this the first
    # save() fails on a fresh machine.
    os.makedirs(output_dir, exist_ok=True)

    for prd in predictions:
        page_idx, boxes, classes, probas = itemgetter("page_idx", "bboxes", "classes", "probas")(prd)
        image = draw_coco_boxes(pages[page_idx], boxes, classes, probas)
        image.save(f"{output_dir}/{page_idx}.png")
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line and return the namespace; ``--pdf_path`` is mandatory."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--pdf_path", required=True)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Upload the PDF to the local prediction service, print and visualize the result.

    Args:
        args: namespace with a ``pdf_path`` attribute naming the PDF to send.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    # Use a context manager so the file handle is closed once the upload is
    # done, including when the request itself raises.
    with open(args.pdf_path, "rb") as pdf_file:
        response = requests.post("http://127.0.0.1:5000", data=pdf_file)
    response.raise_for_status()
    predictions = response.json()

    print(json.dumps(predictions, indent=2))
    annotate(args.pdf_path, predictions)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the command line and hand it to main().
    main(parse_args())
|
||||
@ -1,35 +0,0 @@
|
||||
import argparse
|
||||
|
||||
from PIL import Image
|
||||
from flask import Flask, request, jsonify
|
||||
from pathlib import Path
|
||||
|
||||
# Application object; the handlers below are registered on it.
app = Flask(__name__)


@app.before_first_request
def init():
    """Create the module-level predictor ``PRED`` before the first request is handled.

    The predictor is constructed from the module-level ``args.resume`` value.
    The import is kept local so it only runs when this hook fires.
    NOTE(review): ``before_first_request`` is deprecated in later Flask
    releases - confirm against the pinned Flask version before reuse.
    """
    global PRED

    from fb_detr.predictor import Predictor as _Predictor

    PRED = _Predictor(args.resume)
|
||||
|
||||
|
||||
@app.route("/", methods=["GET", "POST"])
def predict_request():
    """Run the predictor on all PNG files of a folder and return the first result.

    POST expects the form field ``image_folder_path``. Every ``*.png`` in that
    folder is opened and passed to ``PRED.predict``; as before, only the first
    result is sent back as JSON. Any other method gets the plain text
    "Not implemented".

    Returns:
        A JSON response with the first result, a 400 JSON error when the form
        field is missing, a 404 JSON error when the predictor yields nothing,
        or the "Not implemented" string for non-POST requests.
    """
    if request.method == "POST":
        image_folder_path = request.form.get("image_folder_path")
        if not image_folder_path:
            # Path(None) would raise a TypeError and surface as an opaque 500.
            return jsonify({"error": "Missing form field 'image_folder_path'."}), 400

        images = [Image.open(png_path) for png_path in Path(image_folder_path).glob("*.png")]
        results = PRED.predict(images, format_output=True)

        # Only the first result is returned; an empty result set previously
        # made the view return None, which Flask rejects.
        first_result = next(iter(results), None)
        if first_result is None:
            return jsonify({"error": "No predictions were produced."}), 404
        return jsonify(first_result)

    # GET and any implicitly allowed method (e.g. HEAD) end up here instead of
    # falling off the end of the view.
    return "Not implemented"
|
||||
|
||||
|
||||
# Command line handling happens at module level: ``args`` is read by init().
parser = argparse.ArgumentParser()
parser.add_argument("--resume", required=True)
args = parser.parse_args()

# Start the Flask development server with its default host and port.
app.run()
|
||||
26
scripts/pyinfra_mock.py
Normal file
26
scripts/pyinfra_mock.py
Normal file
@ -0,0 +1,26 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line and return the namespace; ``--pdf_path`` is mandatory."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--pdf_path", required=True)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Upload the PDF to the local prediction service and print the JSON answer.

    Args:
        args: namespace with a ``pdf_path`` attribute naming the PDF to send.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    # Use a context manager so the file handle is closed once the upload is
    # done, including when the request itself raises.
    with open(args.pdf_path, "rb") as pdf_file:
        response = requests.post("http://127.0.0.1:5000", data=pdf_file)
    response.raise_for_status()
    predictions = response.json()

    print(json.dumps(predictions, indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the command line and hand it to main().
    main(parse_args())
|
||||
4
setup.py
4
setup.py
@ -3,11 +3,11 @@
|
||||
from distutils.core import setup
|
||||
|
||||
setup(
|
||||
name="fb_detr",
|
||||
name="image_prediction",
|
||||
version="0.1.0",
|
||||
description="",
|
||||
author="",
|
||||
author_email="",
|
||||
url="",
|
||||
packages=["fb_detr"],
|
||||
packages=["image_prediction"],
|
||||
)
|
||||
|
||||
@ -11,5 +11,5 @@ dvc pull
|
||||
|
||||
git submodule update --init --recursive
|
||||
|
||||
docker build -f Dockerfile_base -t fb-detr-base .
|
||||
docker build -f Dockerfile -t fb-detr .
|
||||
docker build -f Dockerfile_base -t image-prediction-base .
|
||||
docker build -f Dockerfile -t image-prediction .
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
sonar.exclusions=bamboo-specs/**, **/test_data/**, **/detr/**
|
||||
sonar.exclusions=bamboo-specs/**, **/test_data/**
|
||||
sonar.c.file.suffixes=-
|
||||
sonar.cpp.file.suffixes=-
|
||||
sonar.objc.file.suffixes=-
|
||||
|
||||
76
src/serve.py
76
src/serve.py
@ -1,37 +1,19 @@
|
||||
import argparse
|
||||
import logging
|
||||
from typing import Callable
|
||||
import tempfile
|
||||
|
||||
from flask import Flask, request, jsonify
|
||||
from waitress import serve
|
||||
|
||||
from fb_detr.config import CONFIG
|
||||
from fb_detr.utils.estimator import suppress_userwarnings, initialize_predictor
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.predictor import Predictor, extract_image_metadata_pairs, classify_images
|
||||
from image_prediction.response import build_response
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--resume")
|
||||
parser.add_argument("--warnings", action="store_true", default=False)
|
||||
args = parser.parse_args()
|
||||
def main():
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main(args):
|
||||
if not args.warnings:
|
||||
suppress_userwarnings()
|
||||
|
||||
predictor = initialize_predictor(args.resume)
|
||||
predictor = Predictor()
|
||||
logging.info("Predictor ready.")
|
||||
|
||||
prediction_server = make_prediction_server(predictor.predict_pdf)
|
||||
|
||||
run_prediction_server(prediction_server, mode=CONFIG.webserver.mode)
|
||||
|
||||
|
||||
def make_prediction_server(predict_fn: Callable):
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route("/ready", methods=["GET"])
|
||||
@ -48,38 +30,31 @@ def make_prediction_server(predict_fn: Callable):
|
||||
|
||||
@app.route("/", methods=["POST"])
|
||||
def predict():
|
||||
def __predict():
|
||||
|
||||
def inner():
|
||||
|
||||
pdf = request.data
|
||||
|
||||
logging.debug("Running predictor on document...")
|
||||
predictions = predict_fn(pdf)
|
||||
logging.debug(f"Found {len(predictions)} images in document.")
|
||||
response = jsonify(list(predictions))
|
||||
pdf = request.data
|
||||
|
||||
logging.debug("Running predictor on document...")
|
||||
with tempfile.NamedTemporaryFile() as tmp_file:
|
||||
tmp_file.write(pdf)
|
||||
image_metadata_pairs = extract_image_metadata_pairs(tmp_file.name)
|
||||
try:
|
||||
predictions, metadata = classify_images(predictor, image_metadata_pairs)
|
||||
except Exception as err:
|
||||
logging.warning("Analysis failed.")
|
||||
logging.exception(err)
|
||||
response = jsonify("Analysis failed.")
|
||||
response.status_code = 500
|
||||
return response
|
||||
logging.debug(f"Found images in document.")
|
||||
|
||||
logging.info(f"Analyzing...")
|
||||
result = inner()
|
||||
logging.info("Analysis completed.")
|
||||
return result
|
||||
response = jsonify(build_response(list(predictions), list(metadata)))
|
||||
|
||||
try:
|
||||
return __predict()
|
||||
except Exception as err:
|
||||
logging.warning("Analysis failed.")
|
||||
logging.exception(err)
|
||||
response = jsonify("Analysis failed.")
|
||||
response.status_code = 500
|
||||
return response
|
||||
logging.info("Analysis completed.")
|
||||
return response
|
||||
|
||||
return app
|
||||
run_prediction_server(app, mode=CONFIG.webserver.mode)
|
||||
|
||||
|
||||
def run_prediction_server(app, mode="development"):
|
||||
|
||||
if mode == "development":
|
||||
app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True)
|
||||
elif mode == "production":
|
||||
@ -87,7 +62,6 @@ def run_prediction_server(app, mode="development"):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logging_level = CONFIG.service.logging_level
|
||||
logging.basicConfig(level=logging_level)
|
||||
logging.getLogger("flask").setLevel(logging.ERROR)
|
||||
@ -95,6 +69,4 @@ if __name__ == "__main__":
|
||||
logging.getLogger("werkzeug").setLevel(logging.ERROR)
|
||||
logging.getLogger("waitress").setLevel(logging.ERROR)
|
||||
|
||||
args = parse_args()
|
||||
|
||||
main(args)
|
||||
main()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user