Merge in RR/image-prediction from integrate-image-extraction-new-pyinfra to master
Squashed commit of the following:
commit 8470c065c71ea2a985aadfc399fb32c693e3a90f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Aug 18 09:19:52 2022 +0200
add key script
commit 8f6eb1e79083fb32fb7bedac640c10b6fd411899
Merge: 27fd7de c1b9629
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Aug 18 09:17:50 2022 +0200
Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into integrate-image-extraction-new-pyinfra
commit 27fd7de39a59d0d88fbddb471dd7797b61223ece
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 17 13:15:09 2022 +0200
update pyinfra
commit ca58f85642598dc15e286074982e7cedae9a1355
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 16 16:16:10 2022 +0200
update pdf2image-service
commit f43795cee0e211e14ac5f9296b01d440ae759c55
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Aug 15 10:32:02 2022 +0200
update pipeline script to also work with figure detection metadata
commit 2b2da1b60ce56fb006cf2f6b65aeda9774391b2a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Aug 12 13:37:48 2022 +0200
add new pyinfra, add optional image classification under key dataCV if figure metadata is present on storage
commit bae25bedbd3a262a9d00e18a1b19f4ee6f1eb924
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 13:27:41 2022 +0200
tidy-up
commit 287b0ebc8a952e506185d13508eaa386d0420704
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 12:57:35 2022 +0200
update server logic for new pyinfra, add extraction from scanned PDF with figure detection logic
commit 3225cefaa25e4559b105397bc06c867a22806ba8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 10:37:31 2022 +0200
integrate new pyinfra logic
commit 46926078342b0680a7416560bb69bec037cf8038
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 3 13:15:27 2022 +0200
add image extraction for scanned PDFs WIP
commit 1b3b11b6f9044d44cb9a822a78197a2ebc6f306a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 3 09:41:06 2022 +0200
add pyinfra and pdf2image as git submodule
73 lines
2.4 KiB
Python
73 lines
2.4 KiB
Python
import os
|
|
from functools import partial
|
|
from itertools import chain, tee
|
|
from typing import Iterable
|
|
|
|
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
|
|
from tqdm import tqdm
|
|
|
|
from image_prediction.config import CONFIG
|
|
from image_prediction.default_objects import (
|
|
get_formatter,
|
|
get_mlflow_model_loader,
|
|
get_image_classifier,
|
|
get_encoder,
|
|
get_dispatched_extract,
|
|
)
|
|
from image_prediction.locations import MLRUNS_DIR
|
|
from image_prediction.utils.generic import lift, starlift
|
|
|
|
# Set the TF_CPP_MIN_LOG_LEVEL environment variable for this process
# (presumably to quiet TensorFlow's native log output; "3" is assumed to be
# the most restrictive level - confirm against the TensorFlow docs).
# NOTE(review): this assignment runs after the image_prediction imports above;
# if any of them imports TensorFlow at import time, the setting may come too
# late to take effect - confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|
|
|
|
|
def load_pipeline(**kwargs):
    """Build a ``Pipeline`` from the MLflow loader and the configured run id.

    The model loader is created from ``MLRUNS_DIR`` and the model identifier
    is read from ``CONFIG.service.mlflow_run_id``.

    Args:
        **kwargs: Forwarded unchanged to the ``Pipeline`` constructor.

    Returns:
        The newly constructed ``Pipeline`` instance.
    """
    # Arguments are evaluated left to right, so the loader is still created
    # before the run id is read from the configuration.
    return Pipeline(
        get_mlflow_model_loader(MLRUNS_DIR),
        CONFIG.service.mlflow_run_id,
        **kwargs,
    )
|
|
|
|
|
|
def parallel(*fs):
    """Combine several one-argument functions into one multi-argument function.

    The returned callable takes positional arguments and lazily applies the
    i-th function to the i-th argument.

    Args:
        *fs: Functions to pair up with the positional arguments.

    Returns:
        A callable ``(*args) -> generator`` yielding ``fs[i](args[i])``.
        Pairing uses ``zip``, so surplus functions or arguments are ignored.
    """

    def apply_pairwise(*args):
        # A generator expression keeps the application lazy: no function runs
        # until the result is iterated.
        return (func(arg) for func, arg in zip(fs, args))

    return apply_pairwise
|
|
|
|
|
|
def star(f):
    """Adapt ``f`` so it accepts a single iterable of positional arguments.

    Args:
        f: The callable to adapt.

    Returns:
        A one-argument callable that unpacks its argument into ``f``.
    """

    def call_unpacked(packed_args):
        return f(*packed_args)

    return call_unpacked
|
|
|
|
|
|
class Pipeline:
    """Callable chain that extracts images from a PDF and describes each one.

    The chain built in ``__init__`` is stored in ``self.pipe``; calling the
    instance runs it and yields its items behind an optional progress bar.
    """

    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
        """Assemble the processing chain.

        Args:
            model_loader: Passed to ``get_image_classifier``.
            model_identifier: Passed to ``get_image_classifier``.
            batch_size: Chunk size handed to ``chunks`` ahead of classification.
            verbose: When falsy, the progress bar in ``__call__`` is disabled.
            **kwargs: Forwarded to ``get_dispatched_extract``.
        """
        self.verbose = verbose

        # The factories are invoked in this fixed order.
        extractor = get_dispatched_extract(**kwargs)
        image_classifier = get_image_classifier(model_loader, model_identifier)
        formatter = get_formatter()
        encoder = get_encoder()

        def merge_record(prediction, representation, metadata):
            # Key order matters: metadata entries may override "classification",
            # while "representation" always overrides a same-named metadata key.
            return {"classification": prediction, **metadata, "representation": representation}

        def pairwise_apply(*stage_fns):
            # One function per stream; the streams arrive packed in one iterable.
            return star(parallel(*stage_fns))

        # Copy the extracted stream three times and map first / first / second
        # over the copies (presumably image, image and metadata streams - this
        # depends on what ``lift`` does; confirm).
        fan_out = rcompose(
            rpartial(tee, 3),
            star(parallel(lift(first), lift(first), lift(second))),
        )

        # Chunk the stream, run the lifted classifier over the chunks, then
        # flatten the per-chunk results back into a single stream.
        classify = rcompose(
            partial(chunks, batch_size),
            lift(image_classifier),
            chain.from_iterable,
        )

        # Zip the three streams and build one dict per zipped triple.
        join = rcompose(star(zip), starlift(merge_record))

        #                       />--classify--\
        # --extract-->--split--+->--encode---->+--join-->reformat
        #                       \>--identity--/

        self.pipe = rcompose(
            extractor,  # ... image-metadata-pairs as a stream
            fan_out,  # ... into an image stream and a metadata stream
            pairwise_apply(classify, encoder, identity),  # ... one function per stream
            join,  # ... the streams by zipping
            formatter,  # ... the items
        )

    def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
        """Run the chain on ``pdf`` and yield its items.

        Args:
            pdf: Passed positionally to ``self.pipe``.
            page_range: Passed to ``self.pipe`` as keyword ``page_range``.
            metadata_per_image: Passed to ``self.pipe`` as keyword
                ``metadata_per_image``.

        Yields:
            The items produced by ``self.pipe``, wrapped in a tqdm progress bar
            that is disabled when ``self.verbose`` is falsy.
        """
        item_stream = self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image)
        progress = tqdm(
            item_stream,
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )
        yield from progress
|