Merge in RR/image-prediction from integrate-image-extraction-new-pyinfra to master
Squashed commit of the following:
commit 8470c065c71ea2a985aadfc399fb32c693e3a90f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Aug 18 09:19:52 2022 +0200
add key script
commit 8f6eb1e79083fb32fb7bedac640c10b6fd411899
Merge: 27fd7de c1b9629
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Aug 18 09:17:50 2022 +0200
Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into integrate-image-extraction-new-pyinfra
commit 27fd7de39a59d0d88fbddb471dd7797b61223ece
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 17 13:15:09 2022 +0200
update pyinfra
commit ca58f85642598dc15e286074982e7cedae9a1355
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 16 16:16:10 2022 +0200
update pdf2image-service
commit f43795cee0e211e14ac5f9296b01d440ae759c55
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Aug 15 10:32:02 2022 +0200
update pipeline script to also work with figure detection metadata
commit 2b2da1b60ce56fb006cf2f6b65aeda9774391b2a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Aug 12 13:37:48 2022 +0200
add new pyinfra, add optional image classification under key dataCV if figure metadata is present on storage
commit bae25bedbd3a262a9d00e18a1b19f4ee6f1eb924
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 13:27:41 2022 +0200
tidy-up
commit 287b0ebc8a952e506185d13508eaa386d0420704
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 12:57:35 2022 +0200
update server logic for new pyinfra, add extraction from scanned PDF with figure detection logic
commit 3225cefaa25e4559b105397bc06c867a22806ba8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 10:37:31 2022 +0200
integrate new pyinfra logic
commit 46926078342b0680a7416560bb69bec037cf8038
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 3 13:15:27 2022 +0200
add image extraction for scanned PDFs WIP
commit 1b3b11b6f9044d44cb9a822a78197a2ebc6f306a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 3 09:41:06 2022 +0200
add pyinfra and pdf2image as git submodule
55 lines
2.3 KiB
Python
55 lines
2.3 KiB
Python
from typing import Iterable
|
|
|
|
from funcy import juxt
|
|
|
|
from image_prediction.classifier.classifier import Classifier
|
|
from image_prediction.classifier.image_classifier import ImageClassifier
|
|
from image_prediction.compositor.compositor import TransformerCompositor
|
|
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
|
|
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
|
|
from image_prediction.formatter.formatter import format_image_plus
|
|
from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
|
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
|
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
|
from image_prediction.label_mapper.mappers.probability import ProbabilityMapper
|
|
from image_prediction.model_loader.loader import ModelLoader
|
|
from image_prediction.model_loader.loaders.mlflow import MlflowConnector
|
|
from image_prediction.redai_adapter.mlflow import MlflowModelReader
|
|
from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
|
|
from image_prediction.transformer.transformers.response import ResponseTransformer
|
|
from pdf2img.extraction import extract_images_via_metadata
|
|
|
|
|
|
def get_mlflow_model_loader(mlruns_dir):
    """Build a ``ModelLoader`` backed by an MLflow connector.

    The loader is composed as
    ``ModelLoader(MlflowConnector(MlflowModelReader(mlruns_dir)))``.

    Args:
        mlruns_dir: Passed unchanged to ``MlflowModelReader``
            (presumably the MLflow runs directory -- confirm against the reader).

    Returns:
        The assembled ``ModelLoader`` instance.
    """
    reader = MlflowModelReader(mlruns_dir)
    connector = MlflowConnector(reader)
    return ModelLoader(connector)
|
|
|
|
|
|
def get_image_classifier(model_loader, model_identifier):
    """Assemble an ``ImageClassifier`` for the model named by ``model_identifier``.

    Args:
        model_loader: Object exposing ``load_model`` and ``load_classes``;
            both are called with ``model_identifier``.
        model_identifier: Identifier handed to both loader methods.

    Returns:
        An ``ImageClassifier`` wrapping a ``Classifier`` built from an
        ``EstimatorAdapter`` around the loaded model and a
        ``ProbabilityMapper`` over the loaded classes.
    """
    # Resolve both bound methods up front, then invoke them in order
    # (model first, classes second) with the same identifier.
    loaders = (model_loader.load_model, model_loader.load_classes)
    model, classes = (load(model_identifier) for load in loaders)

    estimator = EstimatorAdapter(model)
    label_mapper = ProbabilityMapper(classes)
    return ImageClassifier(Classifier(estimator, label_mapper))
|
|
|
|
|
|
def get_dispatched_extract(**kwargs):
    """Create an extraction function that dispatches on the presence of metadata.

    A ``ParsablePDFImageExtractor`` is constructed once, up front, from
    ``kwargs`` and reused by every call of the returned function.

    Args:
        **kwargs: Forwarded unchanged to ``ParsablePDFImageExtractor``.

    Returns:
        A generator function ``extract(pdf, page_range=None, metadata_per_image=None)``.
    """
    parsable_extractor = ParsablePDFImageExtractor(**kwargs)

    def extract(pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
        """Yield extracted images from ``pdf``.

        If ``metadata_per_image`` is falsy (``None`` or empty), extraction is
        delegated to the parsable-PDF extractor with ``page_range``.
        Otherwise images are extracted via ``extract_images_via_metadata`` and
        each result is passed through ``format_image_plus``; ``page_range`` is
        not used on that path.
        """
        # Guard clause: no metadata -> regular parsable-PDF extraction.
        if not metadata_per_image:
            yield from parsable_extractor.extract(pdf, page_range)
            return

        extracted = extract_images_via_metadata(pdf, metadata_per_image)
        yield from map(format_image_plus, extracted)

    return extract
|
|
|
|
|
|
def get_formatter():
    """Return the ``TransformerCompositor`` used to format results.

    The compositor is given, in this order: ``PDFNetCoordinateTransformer``,
    ``EnumFormatter``, ``ResponseTransformer`` and
    ``Snake2CamelCaseKeyFormatter``.
    """
    return TransformerCompositor(
        PDFNetCoordinateTransformer(),
        EnumFormatter(),
        ResponseTransformer(),
        Snake2CamelCaseKeyFormatter(),
    )
|
|
|
|
|
|
def get_encoder():
    """Return a new ``HashEncoder`` instance."""
    encoder = HashEncoder()
    return encoder
|