Revert ad hoc figure detection changes
- Revert the pipeline and serve logic to their state before the "figure detection data for image extraction" changes: figure detection data as input is not supported for now.
This commit is contained in:
parent
25fc7d84b9
commit
98dc001123
@ -1,5 +1,3 @@
|
||||
from typing import Iterable
|
||||
|
||||
from funcy import juxt
|
||||
|
||||
from image_prediction.classifier.classifier import Classifier
|
||||
@ -7,7 +5,6 @@ from image_prediction.classifier.image_classifier import ImageClassifier
|
||||
from image_prediction.compositor.compositor import TransformerCompositor
|
||||
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
|
||||
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
|
||||
from image_prediction.formatter.formatter import format_image_plus
|
||||
from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
||||
@ -17,7 +14,6 @@ from image_prediction.model_loader.loaders.mlflow import MlflowConnector
|
||||
from image_prediction.redai_adapter.mlflow import MlflowModelReader
|
||||
from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
|
||||
from image_prediction.transformer.transformers.response import ResponseTransformer
|
||||
from pdf2img.extraction import extract_images_via_metadata
|
||||
|
||||
|
||||
def get_mlflow_model_loader(mlruns_dir):
|
||||
@ -30,17 +26,10 @@ def get_image_classifier(model_loader, model_identifier):
|
||||
return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))
|
||||
|
||||
|
||||
def get_dispatched_extract(**kwargs):
|
||||
def get_extractor(**kwargs):
|
||||
image_extractor = ParsablePDFImageExtractor(**kwargs)
|
||||
|
||||
def extract(pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
|
||||
if metadata_per_image:
|
||||
image_pluses = extract_images_via_metadata(pdf, metadata_per_image)
|
||||
yield from map(format_image_plus, image_pluses)
|
||||
else:
|
||||
yield from image_extractor.extract(pdf, page_range)
|
||||
|
||||
return extract
|
||||
return image_extractor
|
||||
|
||||
|
||||
def get_formatter():
|
||||
|
||||
@ -1,10 +1,6 @@
|
||||
import abc
|
||||
|
||||
from image_prediction.image_extractor.extractor import ImageMetadataPair
|
||||
from image_prediction.info import Info
|
||||
|
||||
from image_prediction.transformer.transformer import Transformer
|
||||
from pdf2img.default_objects.image import ImagePlus
|
||||
|
||||
|
||||
class Formatter(Transformer):
|
||||
@ -17,19 +13,3 @@ class Formatter(Transformer):
|
||||
|
||||
def __call__(self, obj):
|
||||
return self.format(obj)
|
||||
|
||||
|
||||
def format_image_plus(image: ImagePlus) -> ImageMetadataPair:
    """Turn an ``ImagePlus`` into an ``ImageMetadataPair``.

    The metadata dictionary is keyed by ``Info`` members and filled from
    ``image.info``: page width/height/number from ``pageInfo``, the alpha
    value, and the bounding-box size and corner coordinates from
    ``boundingBox``. The first element of the pair is whatever
    ``image.aspil()`` returns.
    """
    info = image.info

    # Page-level values and alpha come first, in the same key order as before.
    page = info.pageInfo
    metadata = {
        Info.PAGE_WIDTH: page.width,
        Info.PAGE_HEIGHT: page.height,
        Info.PAGE_IDX: page.number,
        Info.ALPHA: info.alpha,
    }

    # Bounding-box values; note that X1/X2/Y1/Y2 are fed from x0/x1/y0/y1.
    box = info.boundingBox
    metadata.update(
        {
            Info.WIDTH: box.width,
            Info.HEIGHT: box.height,
            Info.X1: box.x0,
            Info.X2: box.x1,
            Info.Y1: box.y0,
            Info.Y2: box.y1,
        }
    )

    return ImageMetadataPair(image.aspil(), metadata)
|
||||
|
||||
@ -11,8 +11,8 @@ from image_prediction.default_objects import (
|
||||
get_formatter,
|
||||
get_mlflow_model_loader,
|
||||
get_image_classifier,
|
||||
get_extractor,
|
||||
get_encoder,
|
||||
get_dispatched_extract,
|
||||
)
|
||||
from image_prediction.locations import MLRUNS_DIR
|
||||
from image_prediction.utils.generic import lift, starlift
|
||||
@ -41,7 +41,7 @@ class Pipeline:
|
||||
def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
|
||||
self.verbose = verbose
|
||||
|
||||
extract = get_dispatched_extract(**kwargs)
|
||||
extract = get_extractor(**kwargs)
|
||||
classifier = get_image_classifier(model_loader, model_identifier)
|
||||
reformat = get_formatter()
|
||||
represent = get_encoder()
|
||||
@ -63,9 +63,9 @@ class Pipeline:
|
||||
reformat, # ... the items
|
||||
)
|
||||
|
||||
def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
|
||||
def __call__(self, pdf: bytes, page_range: range = None):
|
||||
yield from tqdm(
|
||||
self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image),
|
||||
self.pipe(pdf, page_range=page_range),
|
||||
desc="Processing images from document",
|
||||
unit=" images",
|
||||
disable=not self.verbose,
|
||||
|
||||
@ -2,7 +2,6 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
from glob import glob
|
||||
from operator import truth
|
||||
|
||||
from image_prediction.pipeline import load_pipeline
|
||||
from image_prediction.utils import get_logger
|
||||
@ -15,7 +14,6 @@ def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("input", help="pdf file or directory")
|
||||
parser.add_argument("--metadata", help="optional figure detection metadata")
|
||||
parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False)
|
||||
parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int)
|
||||
|
||||
@ -24,17 +22,13 @@ def parse_args():
|
||||
return args
|
||||
|
||||
|
||||
def process_pdf(pipeline, pdf_path, metadata=None, page_range=None):
|
||||
if metadata:
|
||||
with open(metadata) as f:
|
||||
metadata = json.load(f)
|
||||
|
||||
def process_pdf(pipeline, pdf_path, page_range=None):
|
||||
with open(pdf_path, "rb") as f:
|
||||
logger.info(f"Processing {pdf_path}")
|
||||
predictions = list(pipeline(f.read(), page_range=page_range, metadata_per_image=metadata))
|
||||
predictions = list(pipeline(f.read(), page_range=page_range))
|
||||
|
||||
annotate_pdf(
|
||||
pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", f"_{truth(metadata)}_annotated.pdf")))
|
||||
pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
|
||||
)
|
||||
|
||||
return predictions
|
||||
@ -48,10 +42,9 @@ def main(args):
|
||||
else:
|
||||
pdf_paths = glob(os.path.join(args.input, "*.pdf"))
|
||||
page_range = range(*args.page_interval) if args.page_interval else None
|
||||
metadata = args.metadata if args.metadata else None
|
||||
|
||||
for pdf_path in pdf_paths:
|
||||
predictions = process_pdf(pipeline, pdf_path, metadata, page_range=page_range)
|
||||
predictions = process_pdf(pipeline, pdf_path, page_range=page_range)
|
||||
if args.print:
|
||||
print(pdf_path)
|
||||
print(json.dumps(predictions, indent=2))
|
||||
|
||||
18
src/serve.py
18
src/serve.py
@ -1,5 +1,4 @@
|
||||
import gzip
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
|
||||
@ -31,32 +30,23 @@ def process_request(request_message):
|
||||
file_id = request_message["fileId"]
|
||||
target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}"
|
||||
response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}"
|
||||
figure_data_file_name = f"{dossier_id}/{file_id}.FIGURE.json.gz"
|
||||
|
||||
bucket = PYINFRA_CONFIG.storage_bucket
|
||||
storage = get_storage(PYINFRA_CONFIG)
|
||||
|
||||
pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size)
|
||||
|
||||
if storage.exists(bucket, target_file_name):
|
||||
if not storage.exists(bucket, target_file_name):
|
||||
should_publish_result = False
|
||||
else:
|
||||
should_publish_result = True
|
||||
object_bytes = storage.get_object(bucket, target_file_name)
|
||||
object_bytes = gzip.decompress(object_bytes)
|
||||
classifications = list(pipeline(pdf=object_bytes))
|
||||
|
||||
if storage.exists(bucket, figure_data_file_name):
|
||||
metadata_bytes = storage.get_object(bucket, figure_data_file_name)
|
||||
metadata_bytes = gzip.decompress(metadata_bytes)
|
||||
metadata_per_image = json.load(io.BytesIO(metadata_bytes))["data"]
|
||||
classifications_cv = list(pipeline(pdf=object_bytes, metadata_per_image=metadata_per_image))
|
||||
else:
|
||||
classifications_cv = []
|
||||
|
||||
result = {**request_message, "data": classifications, "dataCV": classifications_cv}
|
||||
result = {**request_message, "data": classifications}
|
||||
storage_bytes = gzip.compress(json.dumps(result).encode("utf-8"))
|
||||
storage.put_object(bucket, response_file_name, storage_bytes)
|
||||
else:
|
||||
should_publish_result = False
|
||||
|
||||
return should_publish_result, {"dossierId": dossier_id, "fileId": file_id}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user