Julius Unverfehrt 520eee26e3 Pull request #22: Integrate image extraction new pyinfra
Merge in RR/image-prediction from integrate-image-extraction-new-pyinfra to master

Squashed commit of the following:

commit 8470c065c71ea2a985aadfc399fb32c693e3a90f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Aug 18 09:19:52 2022 +0200

    add key script

commit 8f6eb1e79083fb32fb7bedac640c10b6fd411899
Merge: 27fd7de c1b9629
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Aug 18 09:17:50 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into integrate-image-extraction-new-pyinfra

commit 27fd7de39a59d0d88fbddb471dd7797b61223ece
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 17 13:15:09 2022 +0200

    update pyinfra

commit ca58f85642598dc15e286074982e7cedae9a1355
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 16 16:16:10 2022 +0200

    update pdf2image-service

commit f43795cee0e211e14ac5f9296b01d440ae759c55
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Aug 15 10:32:02 2022 +0200

    update pipeline script to also work with figure detection metadata

commit 2b2da1b60ce56fb006cf2f6b65aeda9774391b2a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Aug 12 13:37:48 2022 +0200

    add new pyinfra, add optional image classification under key dataCV if figure metadata is present on storage

commit bae25bedbd3a262a9d00e18a1b19f4ee6f1eb924
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 13:27:41 2022 +0200

    tidy-up

commit 287b0ebc8a952e506185d13508eaa386d0420704
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 12:57:35 2022 +0200

    update server logic for new pyinfra, add extraction from scanned PDF with figure detection logic

commit 3225cefaa25e4559b105397bc06c867a22806ba8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 10:37:31 2022 +0200

    integrate new pyinfra logic

commit 46926078342b0680a7416560bb69bec037cf8038
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 3 13:15:27 2022 +0200

    add image extraction for scanned PDFs WIP

commit 1b3b11b6f9044d44cb9a822a78197a2ebc6f306a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 3 09:41:06 2022 +0200

    add pyinfra and pdf2image as git submodule
2022-08-18 09:20:48 +02:00

73 lines
2.4 KiB
Python

import os
from functools import partial
from itertools import chain, tee
from typing import Iterable
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from tqdm import tqdm
from image_prediction.config import CONFIG
from image_prediction.default_objects import (
get_formatter,
get_mlflow_model_loader,
get_image_classifier,
get_encoder,
get_dispatched_extract,
)
from image_prediction.locations import MLRUNS_DIR
from image_prediction.utils.generic import lift, starlift
# Runs at import time, i.e. before any pipeline can be built by the functions below.
# Presumably silences TensorFlow's native (C++) log output — TODO confirm.
# NOTE(review): this assignment comes after the image_prediction imports above; if any of
# them imports TensorFlow eagerly, the variable may be set too late to take effect — verify.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
def load_pipeline(**kwargs):
    """Build a ``Pipeline`` wired to the configured MLflow model.

    The model loader is created for ``MLRUNS_DIR`` and the model identifier is
    read from ``CONFIG.service.mlflow_run_id``.

    Args:
        **kwargs: Forwarded unchanged to the ``Pipeline`` constructor.

    Returns:
        The newly constructed ``Pipeline`` instance.
    """
    return Pipeline(
        get_mlflow_model_loader(MLRUNS_DIR),
        CONFIG.service.mlflow_run_id,
        **kwargs,
    )
def parallel(*fs):
    """Combine several one-argument functions into one multi-argument function.

    The returned callable takes positional arguments and pairs them with ``fs``
    by position: the i-th function is applied to the i-th argument. Pairing uses
    ``zip``, so surplus functions or surplus arguments are silently ignored.

    Args:
        *fs: Functions to apply, one per positional argument.

    Returns:
        A callable ``(*args) -> generator`` that lazily yields ``f(a)`` for each
        function/argument pair.
    """

    def apply_each(*args):
        # Generator expression: each function is only invoked when its result is consumed.
        return (func(arg) for func, arg in zip(fs, args))

    return apply_each
def star(f):
    """Adapt ``f`` so it accepts a single iterable of positional arguments.

    Args:
        f: Callable taking positional arguments.

    Returns:
        A one-argument callable that unpacks its argument into ``f``,
        i.e. ``star(f)(x)`` is ``f(*x)``.
    """

    def unpacked(x):
        return f(*x)

    return unpacked
class Pipeline:
    """Callable that chains extraction, classification, encoding and formatting.

    The stages are composed once in ``__init__`` and stored in ``self.pipe``;
    calling the instance feeds a PDF through that composition and yields the
    formatted items, wrapped in a ``tqdm`` progress bar.
    """

    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
        """Assemble the processing pipeline.

        Args:
            model_loader: Passed to ``get_image_classifier`` together with
                ``model_identifier`` to obtain the classifier.
            model_identifier: Identifier of the model to classify with.
            batch_size: Number of images handed to the classifier per chunk.
            verbose: When false, the progress bar in ``__call__`` is disabled.
            **kwargs: Forwarded to ``get_dispatched_extract``.
        """
        self.verbose = verbose

        # Stage building blocks, created in this fixed order.
        extract = get_dispatched_extract(**kwargs)
        classifier = get_image_classifier(model_loader, model_identifier)
        reformat = get_formatter()
        represent = get_encoder()

        def merge(prediction, representation, metadata):
            # Key order is deliberate: metadata entries sit between the two fixed keys,
            # and a "representation" key in the metadata would be overridden.
            return {"classification": prediction, **metadata, "representation": representation}

        # Triplicate the pair stream, then project each copy: image, image, metadata.
        # (``lift`` presumably maps a function over a stream — see image_prediction.utils.generic.)
        fan_out = rcompose(
            rpartial(tee, 3),
            star(parallel(lift(first), lift(first), lift(second))),
        )

        # Chunk the image stream, classify chunk by chunk, flatten back to a per-image stream.
        classify_batched = rcompose(
            partial(chunks, batch_size),
            lift(classifier),
            chain.from_iterable,
        )

        # One function per stream, matched by position.
        per_stream = star(parallel(classify_batched, represent, identity))

        # Zip the three streams back together and merge each triple into one dict.
        fan_in = rcompose(star(zip), starlift(merge))

        #                        />--classify--\
        # --extract-->--split--+->--encode---->+--join-->reformat
        #                        \>--identity--/
        self.pipe = rcompose(
            extract,     # image-metadata pairs as a stream
            fan_out,     # split into two image streams and a metadata stream
            per_stream,  # classify / encode / pass through, stream by stream
            fan_in,      # join the streams by zipping
            reformat,    # reformat the joined items
        )

    def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
        """Run the pipeline on a PDF and yield its results one by one.

        Args:
            pdf: Raw PDF content.
            page_range: Forwarded as the ``page_range`` keyword to the pipeline.
            metadata_per_image: Forwarded as the ``metadata_per_image`` keyword
                to the pipeline.

        Yields:
            The items produced by the composed pipeline.
        """
        results = self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image)
        progress = tqdm(
            results,
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )
        yield from progress