From 98dc001123b410bd91269bb184ce6e37bc47f1b7 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Mon, 30 Jan 2023 12:41:12 +0100 Subject: [PATCH] revert adhoc figure detection changes - revert pipeline and serve logic to pre figure detection data for image extraction changes: figure detection data as input not supported for now --- image_prediction/default_objects.py | 15 ++------------- image_prediction/formatter/formatter.py | 20 -------------------- image_prediction/pipeline.py | 8 ++++---- scripts/run_pipeline.py | 15 ++++----------- src/serve.py | 18 ++++-------------- 5 files changed, 14 insertions(+), 62 deletions(-) diff --git a/image_prediction/default_objects.py b/image_prediction/default_objects.py index d66d477..1c40d56 100644 --- a/image_prediction/default_objects.py +++ b/image_prediction/default_objects.py @@ -1,5 +1,3 @@ -from typing import Iterable - from funcy import juxt from image_prediction.classifier.classifier import Classifier @@ -7,7 +5,6 @@ from image_prediction.classifier.image_classifier import ImageClassifier from image_prediction.compositor.compositor import TransformerCompositor from image_prediction.encoder.encoders.hash_encoder import HashEncoder from image_prediction.estimator.adapter.adapter import EstimatorAdapter -from image_prediction.formatter.formatter import format_image_plus from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor @@ -17,7 +14,6 @@ from image_prediction.model_loader.loaders.mlflow import MlflowConnector from image_prediction.redai_adapter.mlflow import MlflowModelReader from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer from image_prediction.transformer.transformers.response import ResponseTransformer -from pdf2img.extraction import extract_images_via_metadata def get_mlflow_model_loader(mlruns_dir): @@ -30,17 +26,10 @@ def get_image_classifier(model_loader, model_identifier): return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes))) -def get_dispatched_extract(**kwargs): +def get_extractor(**kwargs): image_extractor = ParsablePDFImageExtractor(**kwargs) - def extract(pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None): - if metadata_per_image: - image_pluses = extract_images_via_metadata(pdf, metadata_per_image) - yield from map(format_image_plus, image_pluses) - else: - yield from image_extractor.extract(pdf, page_range) - - return extract + return image_extractor def get_formatter(): diff --git a/image_prediction/formatter/formatter.py b/image_prediction/formatter/formatter.py index 53306a9..3f3a1f8 100644 --- a/image_prediction/formatter/formatter.py +++ b/image_prediction/formatter/formatter.py @@ -1,10 +1,6 @@ import abc -from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.info import Info - from image_prediction.transformer.transformer import Transformer -from pdf2img.default_objects.image import ImagePlus class Formatter(Transformer): @@ -17,19 +13,3 @@ class Formatter(Transformer): def __call__(self, obj): return self.format(obj) - - -def format_image_plus(image: ImagePlus) -> ImageMetadataPair: - enum_metadata = { - Info.PAGE_WIDTH: image.info.pageInfo.width, - Info.PAGE_HEIGHT: image.info.pageInfo.height, - Info.PAGE_IDX: image.info.pageInfo.number, - Info.ALPHA: image.info.alpha, - Info.WIDTH: image.info.boundingBox.width, - Info.HEIGHT: image.info.boundingBox.height, - Info.X1: image.info.boundingBox.x0, - Info.X2: image.info.boundingBox.x1, - Info.Y1: image.info.boundingBox.y0, - Info.Y2: image.info.boundingBox.y1, - } - return ImageMetadataPair(image.aspil(), enum_metadata) diff --git a/image_prediction/pipeline.py b/image_prediction/pipeline.py index f9383a1..704a88f 100644 --- a/image_prediction/pipeline.py +++ b/image_prediction/pipeline.py @@ -11,8 +11,8 @@ from image_prediction.default_objects import ( get_formatter, get_mlflow_model_loader, get_image_classifier, + get_extractor, get_encoder, - get_dispatched_extract, ) from image_prediction.locations import MLRUNS_DIR from image_prediction.utils.generic import lift, starlift @@ -41,7 +41,7 @@ class Pipeline: def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs): self.verbose = verbose - extract = get_dispatched_extract(**kwargs) + extract = get_extractor(**kwargs) classifier = get_image_classifier(model_loader, model_identifier) reformat = get_formatter() represent = get_encoder() @@ -63,9 +63,9 @@ class Pipeline: reformat, # ... the items ) - def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None): + def __call__(self, pdf: bytes, page_range: range = None): yield from tqdm( - self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image), + self.pipe(pdf, page_range=page_range), desc="Processing images from document", unit=" images", disable=not self.verbose, diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 29d3199..c2b4bb0 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -2,7 +2,6 @@ import argparse import json import os from glob import glob -from operator import truth from image_prediction.pipeline import load_pipeline from image_prediction.utils import get_logger @@ -15,7 +14,6 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("input", help="pdf file or directory") - parser.add_argument("--metadata", help="optional figure detection metadata") parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False) parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int) @@ -24,17 +22,13 @@ def parse_args(): return args -def process_pdf(pipeline, pdf_path, metadata=None, page_range=None): - if metadata: - with open(metadata) as f: - metadata = json.load(f) - +def process_pdf(pipeline, pdf_path, page_range=None): with open(pdf_path, "rb") as f: logger.info(f"Processing {pdf_path}") - predictions = list(pipeline(f.read(), page_range=page_range, metadata_per_image=metadata)) + predictions = list(pipeline(f.read(), page_range=page_range)) annotate_pdf( - pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", f"_{truth(metadata)}_annotated.pdf"))) + pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf"))) ) return predictions @@ -48,10 +42,9 @@ def main(args): else: pdf_paths = glob(os.path.join(args.input, "*.pdf")) page_range = range(*args.page_interval) if args.page_interval else None - metadata = args.metadata if args.metadata else None for pdf_path in pdf_paths: - predictions = process_pdf(pipeline, pdf_path, metadata, page_range=page_range) + predictions = process_pdf(pipeline, pdf_path, page_range=page_range) if args.print: print(pdf_path) print(json.dumps(predictions, indent=2)) diff --git a/src/serve.py b/src/serve.py index ece6a0b..4960563 100644 --- a/src/serve.py +++ b/src/serve.py @@ -1,5 +1,4 @@ import gzip -import io import json import logging @@ -31,32 +30,23 @@ def process_request(request_message): file_id = request_message["fileId"] target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}" response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}" - figure_data_file_name = f"{dossier_id}/{file_id}.FIGURE.json.gz" bucket = PYINFRA_CONFIG.storage_bucket storage = get_storage(PYINFRA_CONFIG) pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size) - if storage.exists(bucket, target_file_name): + if not storage.exists(bucket, target_file_name): + should_publish_result = False + else: should_publish_result = True object_bytes = storage.get_object(bucket, target_file_name) object_bytes = gzip.decompress(object_bytes) classifications = list(pipeline(pdf=object_bytes)) - if storage.exists(bucket, figure_data_file_name): - metadata_bytes = storage.get_object(bucket, figure_data_file_name) - metadata_bytes = gzip.decompress(metadata_bytes) - metadata_per_image = json.load(io.BytesIO(metadata_bytes))["data"] - classifications_cv = list(pipeline(pdf=object_bytes, metadata_per_image=metadata_per_image)) - else: - classifications_cv = [] - - result = {**request_message, "data": classifications, "dataCV": classifications_cv} + result = {**request_message, "data": classifications} storage_bytes = gzip.compress(json.dumps(result).encode("utf-8")) storage.put_object(bucket, response_file_name, storage_bytes) - else: - should_publish_result = False return should_publish_result, {"dossierId": dossier_id, "fileId": file_id}