From 98dc001123b410bd91269bb184ce6e37bc47f1b7 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Mon, 30 Jan 2023 12:41:12 +0100 Subject: [PATCH 01/25] revert adhoc figure detection changes - revert pipeline and serve logic to pre figure detection data for image extraction changes: figure detection data as input not supported for now --- image_prediction/default_objects.py | 15 ++------------- image_prediction/formatter/formatter.py | 20 -------------------- image_prediction/pipeline.py | 8 ++++---- scripts/run_pipeline.py | 15 ++++----------- src/serve.py | 18 ++++-------------- 5 files changed, 14 insertions(+), 62 deletions(-) diff --git a/image_prediction/default_objects.py b/image_prediction/default_objects.py index d66d477..1c40d56 100644 --- a/image_prediction/default_objects.py +++ b/image_prediction/default_objects.py @@ -1,5 +1,3 @@ -from typing import Iterable - from funcy import juxt from image_prediction.classifier.classifier import Classifier @@ -7,7 +5,6 @@ from image_prediction.classifier.image_classifier import ImageClassifier from image_prediction.compositor.compositor import TransformerCompositor from image_prediction.encoder.encoders.hash_encoder import HashEncoder from image_prediction.estimator.adapter.adapter import EstimatorAdapter -from image_prediction.formatter.formatter import format_image_plus from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor @@ -17,7 +14,6 @@ from image_prediction.model_loader.loaders.mlflow import MlflowConnector from image_prediction.redai_adapter.mlflow import MlflowModelReader from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer from image_prediction.transformer.transformers.response import ResponseTransformer -from pdf2img.extraction import extract_images_via_metadata def get_mlflow_model_loader(mlruns_dir): @@ -30,17 +26,10 @@ def get_image_classifier(model_loader, model_identifier): return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes))) -def get_dispatched_extract(**kwargs): +def get_extractor(**kwargs): image_extractor = ParsablePDFImageExtractor(**kwargs) - def extract(pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None): - if metadata_per_image: - image_pluses = extract_images_via_metadata(pdf, metadata_per_image) - yield from map(format_image_plus, image_pluses) - else: - yield from image_extractor.extract(pdf, page_range) - - return extract + return image_extractor def get_formatter(): diff --git a/image_prediction/formatter/formatter.py b/image_prediction/formatter/formatter.py index 53306a9..3f3a1f8 100644 --- a/image_prediction/formatter/formatter.py +++ b/image_prediction/formatter/formatter.py @@ -1,10 +1,6 @@ import abc -from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.info import Info - from image_prediction.transformer.transformer import Transformer -from pdf2img.default_objects.image import ImagePlus class Formatter(Transformer): @@ -17,19 +13,3 @@ class Formatter(Transformer): def __call__(self, obj): return self.format(obj) - - -def format_image_plus(image: ImagePlus) -> ImageMetadataPair: - enum_metadata = { - Info.PAGE_WIDTH: image.info.pageInfo.width, - Info.PAGE_HEIGHT: image.info.pageInfo.height, - Info.PAGE_IDX: image.info.pageInfo.number, - Info.ALPHA: 
image.info.alpha, - Info.WIDTH: image.info.boundingBox.width, - Info.HEIGHT: image.info.boundingBox.height, - Info.X1: image.info.boundingBox.x0, - Info.X2: image.info.boundingBox.x1, - Info.Y1: image.info.boundingBox.y0, - Info.Y2: image.info.boundingBox.y1, - } - return ImageMetadataPair(image.aspil(), enum_metadata) diff --git a/image_prediction/pipeline.py b/image_prediction/pipeline.py index f9383a1..704a88f 100644 --- a/image_prediction/pipeline.py +++ b/image_prediction/pipeline.py @@ -11,8 +11,8 @@ from image_prediction.default_objects import ( get_formatter, get_mlflow_model_loader, get_image_classifier, + get_extractor, get_encoder, - get_dispatched_extract, ) from image_prediction.locations import MLRUNS_DIR from image_prediction.utils.generic import lift, starlift @@ -41,7 +41,7 @@ class Pipeline: def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs): self.verbose = verbose - extract = get_dispatched_extract(**kwargs) + extract = get_extractor(**kwargs) classifier = get_image_classifier(model_loader, model_identifier) reformat = get_formatter() represent = get_encoder() @@ -63,9 +63,9 @@ class Pipeline: reformat, # ... the items ) - def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None): + def __call__(self, pdf: bytes, page_range: range = None): yield from tqdm( - self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image), + self.pipe(pdf, page_range=page_range), desc="Processing images from document", unit=" images", disable=not self.verbose, diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 29d3199..c2b4bb0 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -2,7 +2,6 @@ import argparse import json import os from glob import glob -from operator import truth from image_prediction.pipeline import load_pipeline from image_prediction.utils import get_logger @@ -15,7 +14,6 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("input", help="pdf file or directory") - parser.add_argument("--metadata", help="optional figure detection metadata") parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False) parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int) @@ -24,17 +22,13 @@ def parse_args(): return args -def process_pdf(pipeline, pdf_path, metadata=None, page_range=None): - if metadata: - with open(metadata) as f: - metadata = json.load(f) - +def process_pdf(pipeline, pdf_path, page_range=None): with open(pdf_path, "rb") as f: logger.info(f"Processing {pdf_path}") - predictions = list(pipeline(f.read(), page_range=page_range, metadata_per_image=metadata)) + predictions = list(pipeline(f.read(), page_range=page_range)) annotate_pdf( - pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", f"_{truth(metadata)}_annotated.pdf"))) + pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf"))) ) return predictions @@ -48,10 +42,9 @@ def main(args): else: pdf_paths = glob(os.path.join(args.input, "*.pdf")) page_range = range(*args.page_interval) if args.page_interval else None - metadata = args.metadata if args.metadata else None for pdf_path in pdf_paths: - predictions = process_pdf(pipeline, pdf_path, metadata, page_range=page_range) + predictions = process_pdf(pipeline, pdf_path, page_range=page_range) if args.print: print(pdf_path) 
print(json.dumps(predictions, indent=2)) diff --git a/src/serve.py b/src/serve.py index ece6a0b..4960563 100644 --- a/src/serve.py +++ b/src/serve.py @@ -1,5 +1,4 @@ import gzip -import io import json import logging @@ -31,32 +30,23 @@ def process_request(request_message): file_id = request_message["fileId"] target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}" response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}" - figure_data_file_name = f"{dossier_id}/{file_id}.FIGURE.json.gz" bucket = PYINFRA_CONFIG.storage_bucket storage = get_storage(PYINFRA_CONFIG) pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size) - if storage.exists(bucket, target_file_name): + if not storage.exists(bucket, target_file_name): + should_publish_result = False + else: should_publish_result = True object_bytes = storage.get_object(bucket, target_file_name) object_bytes = gzip.decompress(object_bytes) classifications = list(pipeline(pdf=object_bytes)) - if storage.exists(bucket, figure_data_file_name): - metadata_bytes = storage.get_object(bucket, figure_data_file_name) - metadata_bytes = gzip.decompress(metadata_bytes) - metadata_per_image = json.load(io.BytesIO(metadata_bytes))["data"] - classifications_cv = list(pipeline(pdf=object_bytes, metadata_per_image=metadata_per_image)) - else: - classifications_cv = [] - - result = {**request_message, "data": classifications, "dataCV": classifications_cv} + result = {**request_message, "data": classifications} storage_bytes = gzip.compress(json.dumps(result).encode("utf-8")) storage.put_object(bucket, response_file_name, storage_bytes) - else: - should_publish_result = False return should_publish_result, {"dossierId": dossier_id, "fileId": file_id} From 4eb7f3c40a6f9e414bdf67a0f191d44627d2d2f7 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Tue, 31 Jan 2023 10:37:27 +0100 Subject: [PATCH 02/25] rename publishing flag --- src/serve.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/serve.py b/src/serve.py index 4960563..d5cf053 100644 --- a/src/serve.py +++ b/src/serve.py @@ -37,9 +37,9 @@ def process_request(request_message): pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size) if not storage.exists(bucket, target_file_name): - should_publish_result = False + publish_result = False else: - should_publish_result = True + publish_result = True object_bytes = storage.get_object(bucket, target_file_name) object_bytes = gzip.decompress(object_bytes) classifications = list(pipeline(pdf=object_bytes)) @@ -48,7 +48,7 @@ def process_request(request_message): storage_bytes = gzip.compress(json.dumps(result).encode("utf-8")) storage.put_object(bucket, response_file_name, storage_bytes) - return should_publish_result, {"dossierId": dossier_id, "fileId": file_id} + return publish_result, {"dossierId": dossier_id, "fileId": file_id} def main(): From 1490d273080900374cf2894d01b66192fb2e1189 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Tue, 31 Jan 2023 17:18:28 +0100 Subject: [PATCH 03/25] introduce adhoc filter for scanned pages --- .../image_extractor/extractors/parsable.py | 48 +++---------------- .../transformer/transformers/response.py | 13 ++--- test/unit_tests/image_extractor_test.py | 3 +- 3 files changed, 16 insertions(+), 48 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py 
b/image_prediction/image_extractor/extractors/parsable.py index eac09e1..0199d49 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -5,14 +5,16 @@ import traceback from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth -from typing import List, Iterable, Iterator +from typing import Iterable, Iterator import fitz from PIL import Image -from funcy import rcompose, merge, pluck, curry, compose +from funcy import merge, pluck, curry, compose from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair +from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata +from image_prediction.image_extractor.filters import filter_scanned_pages from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size @@ -44,6 +46,9 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs + # FIXME: Heuristic filtering shouldn't take place here, + # consider introducing a preprocessing step before extracting images. + @filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): images = get_images_on_page(self.doc, page) metadata = get_metadata_for_images_on_page(self.doc, page) @@ -104,34 +109,12 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page): yield from metadata -@lru_cache(maxsize=None) -def get_image_infos(page: fitz.Page) -> List[dict]: - return page.get_image_info(xrefs=True) - - @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: maybe_image = load_image_handle_from_xref(doc, xref) return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None -def get_image_metadata(image_info): - - x1, y1, x2, y2 = map(rounder, image_info["bbox"]) - - width = abs(x2 - x1) - height = abs(y2 - y1) - - return { - Info.WIDTH: width, - Info.HEIGHT: height, - Info.X1: x1, - Info.X2: x2, - Info.Y1: y1, - Info.Y2: y2, - } - - def validate_coords_and_passthrough(metadata): yield from map(validate_box_coords, metadata) @@ -144,10 +127,6 @@ def validate_size_and_passthrough(metadata): yield from map(validate_box_size, metadata) -def add_page_metadata(page, metadata): - yield from map(partial(merge, get_page_metadata(page)), metadata) - - def add_alpha_channel_info(doc, page, metadata): page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos) @@ -166,19 +145,6 @@ def load_image_handle_from_xref(doc, xref): return doc.extract_image(xref) -rounder = rcompose(round, int) - - -def get_page_metadata(page): - page_width, page_height = map(rounder, page.mediabox_size) - - return { - Info.PAGE_WIDTH: page_width, - Info.PAGE_HEIGHT: page_height, - Info.PAGE_IDX: page.number, - } - - def has_alpha_channel(doc, xref): maybe_image = load_image_handle_from_xref(doc, xref) diff --git a/image_prediction/transformer/transformers/response.py b/image_prediction/transformer/transformers/response.py index 378fe7b..288c510 100644 --- a/image_prediction/transformer/transformers/response.py +++ b/image_prediction/transformer/transformers/response.py @@ -21,11 +21,6 @@ class ResponseTransformer(Transformer): def build_image_info(data: dict) -> dict: - def compute_geometric_quotient(): - page_area_sqrt = math.sqrt(abs(page_width * 
page_height)) - image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1)) - return image_area_sqrt / page_area_sqrt - page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter( "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha" )(data) @@ -34,7 +29,7 @@ def build_image_info(data: dict) -> dict: label = classification["label"] representation = data["representation"] - geometric_quotient = round(compute_geometric_quotient(), 4) + geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4) min_image_to_page_quotient_breached = bool( geometric_quotient < get_class_specific_min_image_to_page_quotient(label) @@ -89,6 +84,12 @@ def build_image_info(data: dict) -> dict: return image_info +def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1): + page_area_sqrt = math.sqrt(abs(page_width * page_height)) + image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1)) + return image_area_sqrt / page_area_sqrt + + def get_class_specific_min_image_to_page_quotient(label, table=None): return get_class_specific_value( "REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index e52b2b5..c7e1fc2 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -9,7 +9,8 @@ from funcy import first, rest from image_prediction.extraction import extract_images_from_pdf from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.image_extractor.extractors.parsable import extract_pages, get_image_infos, has_alpha_channel +from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel +from image_prediction.image_extractor.extractors.utils import get_image_infos from image_prediction.info import Info from test.utils.comparison import metadata_equal, image_sets_equal from test.utils.generation.pdf import add_image, pdf_stream From 368c54a8be62ebae4207dabb7b8c75dc0c7388ed Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 08:49:24 +0100 Subject: [PATCH 04/25] clean-up filter logic - Logic adapted so that it can potentially be easily removed again from the extraction logic --- .../image_extractor/extractors/parsable.py | 47 +++++++++++- image_prediction/image_extractor/filters.py | 74 +++++++++++++++++++ test/unit_tests/image_extractor_test.py | 3 +- 3 files changed, 118 insertions(+), 6 deletions(-) create mode 100644 image_prediction/image_extractor/filters.py diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 0199d49..970bc03 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -5,15 +5,14 @@ import traceback from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth -from typing import Iterable, Iterator +from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose +from funcy import merge, pluck, curry, compose, rcompose from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair -from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, 
add_page_metadata from image_prediction.image_extractor.filters import filter_scanned_pages from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs @@ -47,7 +46,8 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs # FIXME: Heuristic filtering shouldn't take place here, - # consider introducing a preprocessing step before extracting images. + # consider introducing a preprocessing step before extracting images, + # e.g. together with a image validation step for broken images. @filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): images = get_images_on_page(self.doc, page) @@ -109,12 +109,34 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page): yield from metadata +@lru_cache(maxsize=None) +def get_image_infos(page: fitz.Page) -> List[dict]: + return page.get_image_info(xrefs=True) + + @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: maybe_image = load_image_handle_from_xref(doc, xref) return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None +def get_image_metadata(image_info): + + x1, y1, x2, y2 = map(rounder, image_info["bbox"]) + + width = abs(x2 - x1) + height = abs(y2 - y1) + + return { + Info.WIDTH: width, + Info.HEIGHT: height, + Info.X1: x1, + Info.X2: x2, + Info.Y1: y1, + Info.Y2: y2, + } + + def validate_coords_and_passthrough(metadata): yield from map(validate_box_coords, metadata) @@ -127,6 +149,10 @@ def validate_size_and_passthrough(metadata): yield from map(validate_box_size, metadata) +def add_page_metadata(page, metadata): + yield from map(partial(merge, get_page_metadata(page)), metadata) + + def add_alpha_channel_info(doc, page, metadata): page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos) @@ -145,6 +171,19 @@ def load_image_handle_from_xref(doc, xref): return doc.extract_image(xref) +rounder = rcompose(round, int) + + +def get_page_metadata(page): + page_width, page_height = map(rounder, page.mediabox_size) + + return { + Info.PAGE_WIDTH: page_width, + Info.PAGE_HEIGHT: page_height, + Info.PAGE_IDX: page.number, + } + + def has_alpha_channel(doc, xref): maybe_image = load_image_handle_from_xref(doc, xref) diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py new file mode 100644 index 0000000..d3f59b1 --- /dev/null +++ b/image_prediction/image_extractor/filters.py @@ -0,0 +1,74 @@ +from _operator import itemgetter +from typing import Callable, List + +import fitz +from funcy import first, compose, lmap + +from image_prediction.config import CONFIG +from image_prediction.image_extractor.extractor import ImageExtractor +from image_prediction.transformer.transformers.response import compute_geometric_quotient +from image_prediction.utils import get_logger + +logger = get_logger() + + +def filter_scanned_pages(page_processor: Callable): + """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor. + This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipline). 
+ A scanned page is defined by + - having only one image on a page + - that image having an image_to_page ratio greater than the allowed max value + found in the CONFIG.filters.image_to_page_quotient.max""" + + def inner(self: ImageExtractor, page: fitz.fitz.Page): + metadata = get_metadata(page) + if is_a_scanned_page(metadata): + logger.debug(f"Page {page.number} won't be processed since it is a scanned page.") + yield from [] + else: + yield from page_processor(self, page) + + logger.info(f"Extracting pages with filtering scanned pages...") + return inner + + +def is_a_scanned_page(metadata: List[dict]): + return _contains_only_one_image(metadata) and _breaches_image_to_page_quotient(first(metadata)) + + +def _breaches_image_to_page_quotient(metadata): + page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( + "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height" + )(metadata) + geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4) + quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) + return quotient_breached + + +def _contains_only_one_image(metadata): + return True if len(metadata) == 1 else False + + +def get_metadata(page: fitz.fitz.Page): + def get_image_metadata(image_info): + x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"]) + width = abs(x2 - x1) + height = abs(y2 - y1) + return { + "page_width": page_width, + "page_height": page_height, + "page_number": page_number, + "width": width, + "height": height, + "x1": x1, + "x2": x2, + "y1": y1, + "y2": y2, + } + + image_infos = page.get_image_info(xrefs=True) + page_number = page.number + page_width, page_height = page.mediabox_size + + metadata = lmap(get_image_metadata, image_infos) + return metadata diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index c7e1fc2..8e6916c 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -9,8 +9,7 @@ from funcy import first, rest from image_prediction.extraction import extract_images_from_pdf from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel -from image_prediction.image_extractor.extractors.utils import get_image_infos +from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel, get_image_infos from image_prediction.info import Info from test.utils.comparison import metadata_equal, image_sets_equal from test.utils.generation.pdf import add_image, pdf_stream From 876260f4032024fc45c19422756d7357dd58f28e Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 10:08:36 +0100 Subject: [PATCH 05/25] improve the readability of variable names and docstrings --- image_prediction/image_extractor/extractors/parsable.py | 2 +- image_prediction/image_extractor/filters.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 970bc03..a0c24c1 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -47,7 +47,7 @@ class ParsablePDFImageExtractor(ImageExtractor): # FIXME: Heuristic filtering shouldn't take place here, # consider introducing a preprocessing step before extracting images, - # e.g. 
together with a image validation step for broken images. + # e.g. together with an image validation step for broken images. @filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): images = get_images_on_page(self.doc, page) diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py index d3f59b1..d6cb2f6 100644 --- a/image_prediction/image_extractor/filters.py +++ b/image_prediction/image_extractor/filters.py @@ -16,17 +16,17 @@ def filter_scanned_pages(page_processor: Callable): """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor. This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipline). A scanned page is defined by - - having only one image on a page + - having only one image on a page and - that image having an image_to_page ratio greater than the allowed max value - found in the CONFIG.filters.image_to_page_quotient.max""" + defined in CONFIG.filters.image_to_page_quotient.max""" - def inner(self: ImageExtractor, page: fitz.fitz.Page): + def inner(extractor: ImageExtractor, page: fitz.fitz.Page): metadata = get_metadata(page) if is_a_scanned_page(metadata): logger.debug(f"Page {page.number} won't be processed since it is a scanned page.") yield from [] else: - yield from page_processor(self, page) + yield from page_processor(extractor, page) logger.info(f"Extracting pages with filtering scanned pages...") return inner From 5bd5e0cf2bb91f055b7584d8ab876ee18a1e2e4f Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 12:43:49 +0100 Subject: [PATCH 06/25] refactor - reduce code duplication by adapting functions of the module - use the modules enums for image metadata - improve readabilty of the scanned page detection heuristic --- image_prediction/image_extractor/filters.py | 55 ++++++++------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py index d6cb2f6..720ead0 100644 --- a/image_prediction/image_extractor/filters.py +++ b/image_prediction/image_extractor/filters.py @@ -1,11 +1,12 @@ from _operator import itemgetter -from typing import Callable, List +from typing import Callable import fitz -from funcy import first, compose, lmap +from funcy import first, lmap, second from image_prediction.config import CONFIG from image_prediction.image_extractor.extractor import ImageExtractor +from image_prediction.info import Info from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger @@ -21,10 +22,8 @@ def filter_scanned_pages(page_processor: Callable): defined in CONFIG.filters.image_to_page_quotient.max""" def inner(extractor: ImageExtractor, page: fitz.fitz.Page): - metadata = get_metadata(page) - if is_a_scanned_page(metadata): - logger.debug(f"Page {page.number} won't be processed since it is a scanned page.") - yield from [] + if is_a_scanned_page(page): + logger.info(f"Page {page.number} is a scanned page; skipping image extraction.") else: yield from page_processor(extractor, page) @@ -32,43 +31,29 @@ def filter_scanned_pages(page_processor: Callable): return inner -def is_a_scanned_page(metadata: List[dict]): - return _contains_only_one_image(metadata) and _breaches_image_to_page_quotient(first(metadata)) +def is_a_scanned_page(page: fitz.fitz.Page): + metadata = get_metadata_for_images_on_page(page) + return 
first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata) -def _breaches_image_to_page_quotient(metadata): +def __breaches_image_to_page_quotient(metadatum): page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( - "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height" - )(metadata) - geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4) + Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT + )(metadatum) + geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1) quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) return quotient_breached -def _contains_only_one_image(metadata): - return True if len(metadata) == 1 else False - - -def get_metadata(page: fitz.fitz.Page): - def get_image_metadata(image_info): - x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"]) - width = abs(x2 - x1) - height = abs(y2 - y1) - return { - "page_width": page_width, - "page_height": page_height, - "page_number": page_number, - "width": width, - "height": height, - "x1": x1, - "x2": x2, - "y1": y1, - "y2": y2, - } +def get_metadata_for_images_on_page(page: fitz.fitz.Page): + """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, + however without the validation steps since not required here and take a significant amount of time. + """ + # temporary solution to avoid circular imports without changing the original code + from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata image_infos = page.get_image_info(xrefs=True) - page_number = page.number - page_width, page_height = page.mediabox_size - metadata = lmap(get_image_metadata, image_infos) + metadata = add_page_metadata(page, metadata) + return metadata From 2d385b0a73210eefed3852b176a93965271dc110 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 14:38:55 +0100 Subject: [PATCH 07/25] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 42 ++++++++++------- image_prediction/image_extractor/filters.py | 45 ++++--------------- image_prediction/info.py | 1 + test/utils/comparison.py | 8 +++- 4 files changed, 42 insertions(+), 54 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index a0c24c1..cd5a505 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -9,11 +9,11 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose +from funcy import merge, pluck, curry, compose, rcompose, lmap from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair -from image_prediction.image_extractor.filters import filter_scanned_pages +from image_prediction.image_extractor.filters import filter_metadata_for_scanned_pages from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size @@ -45,13 +45,10 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs - # FIXME: Heuristic filtering shouldn't take place here, - # 
consider introducing a preprocessing step before extracting images, - # e.g. together with an image validation step for broken images. - @filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): - images = get_images_on_page(self.doc, page) - metadata = get_metadata_for_images_on_page(self.doc, page) + metadata = list(get_metadata_for_images_on_page(self.doc, page)) + metadata = filter_metadata_for_scanned_pages(metadata) + images = get_images_on_page(self.doc, metadata) clear_caches() image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata))) @@ -85,10 +82,8 @@ def extract_pages(doc, page_range): yield from pages -@lru_cache(maxsize=None) -def get_images_on_page(doc, page: fitz.Page): - image_infos = get_image_infos(page) - xrefs = map(itemgetter("xref"), image_infos) +def get_images_on_page(doc, metadata): + xrefs = pluck(Info.XREF, metadata) images = map(partial(xref_to_image, doc), xrefs) yield from images @@ -97,18 +92,32 @@ def get_images_on_page(doc, page: fitz.Page): def get_metadata_for_images_on_page(doc, page: fitz.Page): metadata = map(get_image_metadata, get_image_infos(page)) - metadata = validate_coords_and_passthrough(metadata) metadata = filter_out_tiny_images(metadata) - metadata = validate_size_and_passthrough(metadata) metadata = add_page_metadata(page, metadata) metadata = add_alpha_channel_info(doc, page, metadata) + metadata = validate_coords_and_passthrough(metadata) + metadata = validate_size_and_passthrough(metadata) yield from metadata +# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page): +# """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, +# however without the validation steps since not required here and take a significant amount of time. 
+# """ +# # temporary solution to avoid circular imports without changing the original code +# from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata +# +# image_infos = page.get_image_info(xrefs=True) +# metadata = lmap(get_image_metadata, image_infos) +# metadata = add_page_metadata(page, metadata) +# +# return metadata + + @lru_cache(maxsize=None) def get_image_infos(page: fitz.Page) -> List[dict]: return page.get_image_info(xrefs=True) @@ -122,7 +131,8 @@ def xref_to_image(doc, xref) -> Image: def get_image_metadata(image_info): - x1, y1, x2, y2 = map(rounder, image_info["bbox"]) + xref, coords = itemgetter("xref", "bbox")(image_info) + x1, y1, x2, y2 = map(rounder, coords) width = abs(x2 - x1) height = abs(y2 - y1) @@ -134,6 +144,7 @@ def get_image_metadata(image_info): Info.X2: x2, Info.Y1: y1, Info.Y2: y2, + Info.XREF: xref, } @@ -206,7 +217,6 @@ def tiny(metadata): def clear_caches(): get_image_infos.cache_clear() load_image_handle_from_xref.cache_clear() - get_images_on_page.cache_clear() xref_to_image.cache_clear() diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py index 720ead0..e77b84b 100644 --- a/image_prediction/image_extractor/filters.py +++ b/image_prediction/image_extractor/filters.py @@ -1,11 +1,9 @@ from _operator import itemgetter -from typing import Callable +from typing import List -import fitz -from funcy import first, lmap, second +from funcy import first, second from image_prediction.config import CONFIG -from image_prediction.image_extractor.extractor import ImageExtractor from image_prediction.info import Info from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger @@ -13,26 +11,15 @@ from image_prediction.utils import get_logger logger = get_logger() -def filter_scanned_pages(page_processor: Callable): - """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor. - This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipline). 
- A scanned page is defined by - - having only one image on a page and - - that image having an image_to_page ratio greater than the allowed max value - defined in CONFIG.filters.image_to_page_quotient.max""" - - def inner(extractor: ImageExtractor, page: fitz.fitz.Page): - if is_a_scanned_page(page): - logger.info(f"Page {page.number} is a scanned page; skipping image extraction.") - else: - yield from page_processor(extractor, page) - - logger.info(f"Extracting pages with filtering scanned pages...") - return inner +def filter_metadata_for_scanned_pages(metadata: List[dict]): + assert isinstance(metadata, list) + if is_metadata_of_a_scanned_page(metadata): + logger.info(f"Page {first(metadata)[Info.PAGE_IDX]} is a scanned page; skipping image extraction.") + return [] + return metadata -def is_a_scanned_page(page: fitz.fitz.Page): - metadata = get_metadata_for_images_on_page(page) +def is_metadata_of_a_scanned_page(metadata): return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata) @@ -43,17 +30,3 @@ def __breaches_image_to_page_quotient(metadatum): geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1) quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) return quotient_breached - - -def get_metadata_for_images_on_page(page: fitz.fitz.Page): - """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, - however without the validation steps since not required here and take a significant amount of time. - """ - # temporary solution to avoid circular imports without changing the original code - from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata - - image_infos = page.get_image_info(xrefs=True) - metadata = lmap(get_image_metadata, image_infos) - metadata = add_page_metadata(page, metadata) - - return metadata diff --git a/image_prediction/info.py b/image_prediction/info.py index 344274a..987779e 100644 --- a/image_prediction/info.py +++ b/image_prediction/info.py @@ -12,3 +12,4 @@ class Info(Enum): Y1 = "y1" Y2 = "y2" ALPHA = "alpha" + XREF = "xref" diff --git a/test/utils/comparison.py b/test/utils/comparison.py index f2677ce..b4c8d14 100644 --- a/test/utils/comparison.py +++ b/test/utils/comparison.py @@ -1,12 +1,15 @@ +from functools import partial from itertools import starmap, product, repeat from typing import Iterable import numpy as np from PIL.Image import Image from frozendict import frozendict -from funcy import ilen +from funcy import ilen, compose, omit from image_prediction.estimator.preprocessor.utils import image_to_normalized_tensor +from image_prediction.info import Info +from image_prediction.utils.generic import lift def transform_equal(a, b): @@ -18,7 +21,8 @@ def images_equal(im1: Image, im2: Image, **kwargs): def metadata_equal(mdat1: Iterable[dict], mdat2: Iterable[dict]): - return set(map(frozendict, mdat1)) == set(map(frozendict, mdat2)) + f = compose(set, lift(compose(frozendict, partial(omit, keys=[Info.XREF])))) + return f(mdat1) == f(mdat2) def image_sets_equal(ims1: Iterable[Image], ims2: Iterable[Image]): From 9ec6cc19bab45a0d473cb4a8c892a6e55d79b3eb Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 14:53:26 +0100 Subject: [PATCH 08/25] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git 
a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index cd5a505..5010c25 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -9,7 +9,7 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, lmap +from funcy import merge, pluck, curry, compose, rcompose from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair @@ -46,8 +46,11 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs def __process_images_on_page(self, page: fitz.fitz.Page): - metadata = list(get_metadata_for_images_on_page(self.doc, page)) + metadata = list(get_metadata_for_images_on_page(page)) metadata = filter_metadata_for_scanned_pages(metadata) + metadata = list(filter_out_tiny_images(metadata)) + metadata = list(filter_invalid_metadata(metadata)) + metadata = add_alpha_channel_info(self.doc, page, metadata) images = get_images_on_page(self.doc, metadata) clear_caches() @@ -89,21 +92,19 @@ def get_images_on_page(doc, metadata): yield from images -def get_metadata_for_images_on_page(doc, page: fitz.Page): +def get_metadata_for_images_on_page(page: fitz.Page): metadata = map(get_image_metadata, get_image_infos(page)) - metadata = filter_out_tiny_images(metadata) - metadata = add_page_metadata(page, metadata) - metadata = add_alpha_channel_info(doc, page, metadata) - - metadata = validate_coords_and_passthrough(metadata) - metadata = validate_size_and_passthrough(metadata) yield from metadata +def filter_invalid_metadata(metadata): + return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) + + # def get_metadata_for_images_on_page_2(page: fitz.fitz.Page): # """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, # however without the validation steps since not required here and take a significant amount of time. 
From 436a32ad2b4ea213c0c0724e30ae75deef6ff3cf Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 15:07:35 +0100 Subject: [PATCH 09/25] refactor scanned page filtering WIP --- image_prediction/image_extractor/extractors/parsable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 5010c25..f6bbc82 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -50,8 +50,11 @@ class ParsablePDFImageExtractor(ImageExtractor): metadata = filter_metadata_for_scanned_pages(metadata) metadata = list(filter_out_tiny_images(metadata)) metadata = list(filter_invalid_metadata(metadata)) + metadata = add_alpha_channel_info(self.doc, page, metadata) + images = get_images_on_page(self.doc, metadata) + clear_caches() image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata))) @@ -173,7 +176,7 @@ def add_alpha_channel_info(doc, page, metadata): alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)]) page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image) - metadata = starmap(merge, zip(page_to_alpha_mapping_per_image(page), metadata)) + metadata = starmap(merge, zip(metadata, page_to_alpha_mapping_per_image(page))) yield from metadata From 0f440bdb09dbf3e5672f90ba1fe92452e013406f Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 15:14:27 +0100 Subject: [PATCH 10/25] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index f6bbc82..4f80723 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -9,11 +9,14 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose +from funcy import merge, pluck, curry, compose, rcompose, remove from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair -from image_prediction.image_extractor.filters import filter_metadata_for_scanned_pages +from image_prediction.image_extractor.filters import ( + filter_metadata_for_scanned_pages, + __breaches_image_to_page_quotient, +) from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size @@ -47,7 +50,7 @@ class ParsablePDFImageExtractor(ImageExtractor): def __process_images_on_page(self, page: fitz.fitz.Page): metadata = list(get_metadata_for_images_on_page(page)) - metadata = filter_metadata_for_scanned_pages(metadata) + metadata = list(filter_out_page_sized_images(metadata)) metadata = list(filter_out_tiny_images(metadata)) metadata = list(filter_invalid_metadata(metadata)) @@ -108,18 +111,8 @@ def filter_invalid_metadata(metadata): return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) -# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page): -# """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, -# however without 
the validation steps since not required here and take a significant amount of time. -# """ -# # temporary solution to avoid circular imports without changing the original code -# from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata -# -# image_infos = page.get_image_info(xrefs=True) -# metadata = lmap(get_image_metadata, image_infos) -# metadata = add_page_metadata(page, metadata) -# -# return metadata +def filter_out_page_sized_images(metadata): + yield from remove(__breaches_image_to_page_quotient, metadata) @lru_cache(maxsize=None) From c55777e3394c98a41deba7af406638f034a0777c Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 15:16:12 +0100 Subject: [PATCH 11/25] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 13 ++++++++++++- image_prediction/image_extractor/filters.py | 11 +---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 4f80723..0af1f10 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -2,6 +2,7 @@ import atexit import io import json import traceback +from _operator import itemgetter from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth @@ -11,15 +12,16 @@ import fitz from PIL import Image from funcy import merge, pluck, curry, compose, rcompose, remove +from image_prediction.config import CONFIG from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.image_extractor.filters import ( filter_metadata_for_scanned_pages, - __breaches_image_to_page_quotient, ) from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size +from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger from image_prediction.utils.generic import lift @@ -218,3 +220,12 @@ def clear_caches(): atexit.register(clear_caches) + + +def __breaches_image_to_page_quotient(metadatum): + page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( + Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT + )(metadatum) + geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1) + quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) + return quotient_breached diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py index e77b84b..13053de 100644 --- a/image_prediction/image_extractor/filters.py +++ b/image_prediction/image_extractor/filters.py @@ -1,11 +1,9 @@ -from _operator import itemgetter from typing import List from funcy import first, second -from image_prediction.config import CONFIG +from image_prediction.image_extractor.extractors.parsable import __breaches_image_to_page_quotient from image_prediction.info import Info -from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger logger = get_logger() @@ -23,10 +21,3 @@ def 
is_metadata_of_a_scanned_page(metadata): return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata) -def __breaches_image_to_page_quotient(metadatum): - page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( - Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT - )(metadatum) - geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1) - quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) - return quotient_breached From 8c7349c2d1ddb57286324aa1af59bc2a7b1e5cf1 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 15:36:16 +0100 Subject: [PATCH 12/25] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 34 +++++++++++-------- image_prediction/image_extractor/filters.py | 23 ------------- 2 files changed, 20 insertions(+), 37 deletions(-) delete mode 100644 image_prediction/image_extractor/filters.py diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 0af1f10..8cf6ff1 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -15,9 +15,6 @@ from funcy import merge, pluck, curry, compose, rcompose, remove from image_prediction.config import CONFIG from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair -from image_prediction.image_extractor.filters import ( - filter_metadata_for_scanned_pages, -) from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size @@ -51,12 +48,10 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs def __process_images_on_page(self, page: fitz.fitz.Page): - metadata = list(get_metadata_for_images_on_page(page)) - metadata = list(filter_out_page_sized_images(metadata)) - metadata = list(filter_out_tiny_images(metadata)) - metadata = list(filter_invalid_metadata(metadata)) + metadata = get_metadata_for_images_on_page(page) + metadata = the_great_filter(metadata) - metadata = add_alpha_channel_info(self.doc, page, metadata) + metadata = list(add_alpha_channel_info(self.doc, page, metadata)) images = get_images_on_page(self.doc, metadata) @@ -109,12 +104,27 @@ def get_metadata_for_images_on_page(page: fitz.Page): yield from metadata +def the_great_filter(metadata): + return compose( + list, + filter_out_page_sized_images, + list, + filter_out_tiny_images, + list, + filter_invalid_metadata, + )(metadata) + + def filter_invalid_metadata(metadata): return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) def filter_out_page_sized_images(metadata): - yield from remove(__breaches_image_to_page_quotient, metadata) + yield from remove(breaches_image_to_page_quotient, metadata) + + +def filter_out_tiny_images(metadata): + yield from filterfalse(tiny, metadata) @lru_cache(maxsize=None) @@ -151,10 +161,6 @@ def validate_coords_and_passthrough(metadata): yield from map(validate_box_coords, metadata) -def filter_out_tiny_images(metadata): - yield from filterfalse(tiny, metadata) - - def validate_size_and_passthrough(metadata): yield from map(validate_box_size, metadata) @@ -222,7 +228,7 @@ def clear_caches(): atexit.register(clear_caches) -def 
__breaches_image_to_page_quotient(metadatum): +def breaches_image_to_page_quotient(metadatum): page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT )(metadatum) diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py deleted file mode 100644 index 13053de..0000000 --- a/image_prediction/image_extractor/filters.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import List - -from funcy import first, second - -from image_prediction.image_extractor.extractors.parsable import __breaches_image_to_page_quotient -from image_prediction.info import Info -from image_prediction.utils import get_logger - -logger = get_logger() - - -def filter_metadata_for_scanned_pages(metadata: List[dict]): - assert isinstance(metadata, list) - if is_metadata_of_a_scanned_page(metadata): - logger.info(f"Page {first(metadata)[Info.PAGE_IDX]} is a scanned page; skipping image extraction.") - return [] - return metadata - - -def is_metadata_of_a_scanned_page(metadata): - return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata) - - From b880e892ec1b0c9e1d8370711e8ab618cee8e1fe Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 15:47:40 +0100 Subject: [PATCH 13/25] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 8cf6ff1..48b6e6c 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -10,7 +10,7 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, remove +from funcy import merge, pluck, curry, compose, rcompose, remove, rpartial from image_prediction.config import CONFIG from image_prediction.formatter.formatters.enum import EnumFormatter @@ -48,11 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs def __process_images_on_page(self, page: fitz.fitz.Page): - metadata = get_metadata_for_images_on_page(page) - metadata = the_great_filter(metadata) - - metadata = list(add_alpha_channel_info(self.doc, page, metadata)) - + metadata = extract_valid_metadata(self.doc, page) images = get_images_on_page(self.doc, metadata) clear_caches() @@ -95,6 +91,15 @@ def get_images_on_page(doc, metadata): yield from images +def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): + return compose( + list, + partial(add_alpha_channel_info, doc, page), + filter_valid_metadata, + get_metadata_for_images_on_page, + )(page) + + def get_metadata_for_images_on_page(page: fitz.Page): metadata = map(get_image_metadata, get_image_infos(page)) @@ -104,13 +109,10 @@ def get_metadata_for_images_on_page(page: fitz.Page): yield from metadata -def the_great_filter(metadata): - return compose( - list, +def filter_valid_metadata(metadata): + yield from compose( filter_out_page_sized_images, - list, filter_out_tiny_images, - list, filter_invalid_metadata, )(metadata) From 2385584dcbee774ad9c61aba65f430b81b5ea36a Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 15:49:36 +0100 Subject: [PATCH 14/25] refactor scanned page filtering --- 
image_prediction/image_extractor/extractors/parsable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 48b6e6c..ddb392d 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -10,7 +10,7 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, remove, rpartial +from funcy import merge, pluck, curry, compose, rcompose, remove from image_prediction.config import CONFIG from image_prediction.formatter.formatters.enum import EnumFormatter From 73f7491c8f9c01967f95b9e0210779878f32f99c Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 2 Feb 2023 13:36:58 +0100 Subject: [PATCH 15/25] improve performance - disable scanned page filter, since dropping these disables the computation of the images hash and the frontend OCR hint, which are both wanted - optimize image extraction by using arrays instead of byte streams for the conversion to PIL images --- image_prediction/image_extractor/extractors/parsable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index ddb392d..b65e3d3 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -1,5 +1,4 @@ import atexit -import io import json import traceback from _operator import itemgetter @@ -9,6 +8,7 @@ from operator import itemgetter, truth from typing import Iterable, Iterator, List import fitz +import numpy as np from PIL import Image from funcy import merge, pluck, curry, compose, rcompose, remove @@ -111,7 +111,6 @@ def get_metadata_for_images_on_page(page: fitz.Page): def filter_valid_metadata(metadata): yield from compose( - filter_out_page_sized_images, filter_out_tiny_images, filter_invalid_metadata, )(metadata) @@ -136,8 +135,9 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: - maybe_image = load_image_handle_from_xref(doc, xref) - return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None + pixmap = fitz.Pixmap(doc, xref) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w) + return Image.fromarray(array) def get_image_metadata(image_info): From c0b41e77b882520d6ef385ac74997c33b5e038a4 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 2 Feb 2023 13:57:56 +0100 Subject: [PATCH 16/25] implement ad hoc channel count detection for new image extraction --- image_prediction/image_extractor/extractors/parsable.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index b65e3d3..941f431 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -136,7 +136,11 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: pixmap = fitz.Pixmap(doc, xref) - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w) + # FIXME: implement proper logic to determine how many channels the image has + try: + array = 
np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w) + except ValueError: + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) return Image.fromarray(array) From c4416636c0a72b879e9b8d511f791a5e00b301fa Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 2 Feb 2023 14:10:32 +0100 Subject: [PATCH 17/25] beautify --- image_prediction/image_extractor/extractors/parsable.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 941f431..a847e17 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -136,11 +136,8 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: pixmap = fitz.Pixmap(doc, xref) - # FIXME: implement proper logic to determine how many channels the image has - try: - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w) - except ValueError: - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + array = array[:, :, 0] if array.shape[2] == 1 else array return Image.fromarray(array) From 94652aafe47910b0646e3d22b4114376db37d982 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 2 Feb 2023 15:26:33 +0100 Subject: [PATCH 18/25] beautify --- image_prediction/image_extractor/extractors/parsable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index a847e17..805adc8 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -111,6 +111,7 @@ def get_metadata_for_images_on_page(page: fitz.Page): def filter_valid_metadata(metadata): yield from compose( + # filter_out_page_sized_images, TODO: Link to concept for extraction toggling and reclassification endpoint filter_out_tiny_images, filter_invalid_metadata, )(metadata) @@ -135,8 +136,10 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: + # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream pixmap = fitz.Pixmap(doc, xref) array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) array = array[:, :, 0] if array.shape[2] == 1 else array return Image.fromarray(array) From 978f48e8f9b1167f5b483b775869e55d3c6ba56f Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 2 Feb 2023 15:39:44 +0100 Subject: [PATCH 19/25] add ad hoc logic for bad xref handling --- .../image_extractor/extractors/parsable.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 805adc8..27e0f33 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -111,9 +111,10 @@ def get_metadata_for_images_on_page(page: fitz.Page): def filter_valid_metadata(metadata): 
yield from compose( - # filter_out_page_sized_images, TODO: Link to concept for extraction toggling and reclassification endpoint - filter_out_tiny_images, - filter_invalid_metadata, + # filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint. + filter_invalid_metadata, # TODO: this doesn't filter but raises if images are invalid, maybe should filter + filter_out_tiny_images, # FIXME: this implicitly filters invalid metadata, e.g. for zero height images, + # This should be done in filter_invalid_metadata. )(metadata) @@ -137,7 +138,11 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream - pixmap = fitz.Pixmap(doc, xref) + try: + pixmap = fitz.Pixmap(doc, xref) + except ValueError: + # FIXME: Invalid xrefs occur here, this shouldn't be the case. + return array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) array = array[:, :, 0] if array.shape[2] == 1 else array return Image.fromarray(array) From c478333111ff4857e5c6ff09fd8971374b560c27 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 3 Feb 2023 08:25:36 +0100 Subject: [PATCH 20/25] add log in callback to display which file is processed --- src/serve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/serve.py b/src/serve.py index d5cf053..a865c7d 100644 --- a/src/serve.py +++ b/src/serve.py @@ -28,6 +28,7 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root) def process_request(request_message): dossier_id = request_message["dossierId"] file_id = request_message["fileId"] + logger.info(f"Processing {dossier_id=} {file_id=} ...") target_file_name = f"{dossier_id}/{file_id}.{request_message['targetFileExtension']}" response_file_name = f"{dossier_id}/{file_id}.{request_message['responseFileExtension']}" From eff1bb41245ed29eb5cf86f0ee28b13692ec998d Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 3 Feb 2023 09:04:02 +0100 Subject: [PATCH 21/25] adjust behavior of filtering of invalid images --- .../image_extractor/extractors/parsable.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 27e0f33..303021b 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -10,14 +10,15 @@ from typing import Iterable, Iterator, List import fitz import numpy as np from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, remove +from funcy import merge, pluck, curry, compose, rcompose, remove, keep from image_prediction.config import CONFIG +from image_prediction.exceptions import InvalidBox from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs -from image_prediction.stitching.utils import validate_box_coords, validate_box_size +from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box from image_prediction.transformer.transformers.response import compute_geometric_quotient from 
image_prediction.utils import get_logger from image_prediction.utils.generic import lift @@ -101,9 +102,7 @@ def get_metadata_for_images_on_page(page: fitz.Page): - metadata = map(get_image_metadata, get_image_infos(page)) - metadata = add_page_metadata(page, metadata) yield from metadata @@ -111,15 +110,23 @@ def filter_valid_metadata(metadata): yield from compose( - # filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint. - filter_invalid_metadata, # TODO: this doesn't filter but raises if images are invalid, maybe should filter - filter_out_tiny_images, # FIXME: this implicitly filters invalid metadata, e.g. for zero height images, - # This should be done in filter_invalid_metadata. + # TODO: Disabled for now, since the backend currently needs the metadata and the hash of every image, even + # scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images + # and giving the user the ability to reclassify false positives with a separate call. + # filter_out_page_sized_images, + filter_out_tiny_images, + filter_invalid_metadata, )(metadata) def filter_invalid_metadata(metadata): - return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) + def invalid_box_filter(box): + try: + return validate_box(box) + except InvalidBox as e: + logger.debug(f"Dropping invalid metadatum, reason: {e}") + + yield from keep(invalid_box_filter, metadata) def filter_out_page_sized_images(metadata): @@ -142,6 +149,7 @@ def xref_to_image(doc, xref) -> Image: pixmap = fitz.Pixmap(doc, xref) except ValueError: # FIXME: Invalid xrefs occur here, this shouldn't be the case. 
+ logger.debug(f"Xref {xref} is invalid, skipping extraction ...") return array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) From 2995d5ee4827f5233cced0d469efc764407e60ea Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 3 Feb 2023 11:14:14 +0100 Subject: [PATCH 22/25] refactoring --- .../image_extractor/extractors/parsable.py | 51 ++++++++----------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 303021b..75c98f2 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -5,7 +5,7 @@ from _operator import itemgetter from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth -from typing import Iterable, Iterator, List +from typing import Iterable, Iterator, List, Union import fitz import numpy as np @@ -18,7 +18,7 @@ from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs -from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box +from image_prediction.stitching.utils import validate_box from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger from image_prediction.utils.generic import lift @@ -95,7 +95,7 @@ def get_images_on_page(doc, metadata): def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): return compose( list, - partial(add_alpha_channel_info, doc, page), + partial(add_alpha_channel_info, doc), filter_valid_metadata, get_metadata_for_images_on_page, )(page) @@ -115,18 +115,18 @@ def filter_valid_metadata(metadata): # and giving the user the ability to reclassify false positives with a separate call. 
# filter_out_page_sized_images, filter_out_tiny_images, - filter_invalid_metadata, + filter_out_invalid_metadata, )(metadata) -def filter_invalid_metadata(metadata): - def invalid_box_filter(box): +def filter_out_invalid_metadata(metadata): + def __validate_box(box): try: return validate_box(box) - except InvalidBox as e: - logger.debug(f"Dropping invalid metadatum, reason: {e}") + except InvalidBox as err: + logger.debug(f"Dropping invalid metadatum, reason: {err}") - yield from keep(invalid_box_filter, metadata) + yield from keep(__validate_box, metadata) def filter_out_page_sized_images(metadata): @@ -143,18 +143,17 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) -def xref_to_image(doc, xref) -> Image: +def xref_to_image(doc, xref) -> Union[Image.Image, None]: # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream try: pixmap = fitz.Pixmap(doc, xref) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) + array = array[:, :, 0] if array.shape[2] == 1 else array + return Image.fromarray(array) except ValueError: - # FIXME: Invalid xrefs occur here, this shouldn't be the case. logger.debug(f"Xref {xref} is invalid, skipping extraction ...") return - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) - array = array[:, :, 0] if array.shape[2] == 1 else array - return Image.fromarray(array) def get_image_metadata(image_info): @@ -176,29 +175,19 @@ def get_image_metadata(image_info): } -def validate_coords_and_passthrough(metadata): - yield from map(validate_box_coords, metadata) - - -def validate_size_and_passthrough(metadata): - yield from map(validate_box_size, metadata) - - def add_page_metadata(page, metadata): yield from map(partial(merge, get_page_metadata(page)), metadata) -def add_alpha_channel_info(doc, page, metadata): +def add_alpha_channel_info(doc, metadata): + def add_alpha_value_to_metadatum(metadatum): + alpha = metadatum_to_alpha_value(metadatum) + return {**metadatum, Info.ALPHA: alpha} - page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos) xref_to_alpha = partial(has_alpha_channel, doc) - page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs) - alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)]) - page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image) + metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF)) - metadata = starmap(merge, zip(metadata, page_to_alpha_mapping_per_image(page))) - - yield from metadata + yield from map(add_alpha_value_to_metadatum, metadata) @lru_cache(maxsize=None) From bd0279ddd11895c788edbd8c57b78064163647c5 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 3 Feb 2023 12:25:27 +0100 Subject: [PATCH 23/25] introduce normalizing function for image extraction --- .../image_extractor/extractors/parsable.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 75c98f2..9da3ec7 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ 
b/image_prediction/image_extractor/extractors/parsable.py @@ -10,7 +10,7 @@ from typing import Iterable, Iterator, List, Union import fitz import numpy as np from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, remove, keep +from funcy import merge, pluck, compose, rcompose, remove, keep from image_prediction.config import CONFIG from image_prediction.exceptions import InvalidBox @@ -21,7 +21,6 @@ from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger -from image_prediction.utils.generic import lift logger = get_logger() @@ -148,14 +147,32 @@ def xref_to_image(doc, xref) -> Union[Image.Image, None]: try: pixmap = fitz.Pixmap(doc, xref) array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) - array = array[:, :, 0] if array.shape[2] == 1 else array + array = normalize_channels(array) + print(array.shape) + assert array.shape[-1] == 3 + # # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) + # array = array[:, :, 0] if array.shape[2] == 1 else array return Image.fromarray(array) except ValueError: logger.debug(f"Xref {xref} is invalid, skipping extraction ...") return +def normalize_channels(array: np.ndarray): + if not array.ndim == 3: + array = np.expand_dims(array, axis=-1) + + if array.shape[-1] == 4: + array = array[..., :3] + elif array.shape[-1] == 1: + array = np.concatenate([array, array, array], axis=-1) + elif array.shape[-1] != 3: + logger.warning(f"Unexpected image format: {array.shape}.") + raise ValueError(f"Unexpected image format: {array.shape}.") + + return array + + def get_image_metadata(image_info): xref, coords = itemgetter("xref", "bbox")(image_info) From 4d43e385c5fb919b3e9e7ca4b8f76cc140413a8c Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Mon, 6 Feb 2023 09:43:28 +0100 Subject: [PATCH 24/25] replace image extraction logic final --- image_prediction/image_extractor/extractors/parsable.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 9da3ec7..3f3f21e 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -148,10 +148,6 @@ def xref_to_image(doc, xref) -> Union[Image.Image, None]: pixmap = fitz.Pixmap(doc, xref) array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) array = normalize_channels(array) - print(array.shape) - assert array.shape[-1] == 3 - # # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) - # array = array[:, :, 0] if array.shape[2] == 1 else array return Image.fromarray(array) except ValueError: logger.debug(f"Xref {xref} is invalid, skipping extraction ...") From 2bc9c24f6ae34b6a738578c6ef23c4121c213ebf Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Mon, 13 Feb 2023 13:53:30 +0100 Subject: [PATCH 25/25] revert refactoring changes - revert functional refactoring changes to be able to determine where the error described in the ticket comes from - change array normalization to dimensionally sparse 
arrays to reduce memory consumption --- .../image_extractor/extractors/parsable.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 3f3f21e..ad3655f 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -146,22 +146,24 @@ def xref_to_image(doc, xref) -> Union[Image.Image, None]: # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream try: pixmap = fitz.Pixmap(doc, xref) - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - array = normalize_channels(array) + array = convert_pixmap_to_array(pixmap) return Image.fromarray(array) except ValueError: logger.debug(f"Xref {xref} is invalid, skipping extraction ...") return -def normalize_channels(array: np.ndarray): - if not array.ndim == 3: - array = np.expand_dims(array, axis=-1) +def convert_pixmap_to_array(pixmap: fitz.fitz.Pixmap): + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + array = _normalize_channels(array) + return array - if array.shape[-1] == 4: + +def _normalize_channels(array: np.ndarray): + if array.shape[-1] == 1: + array = array[:, :, 0] + elif array.shape[-1] == 4: array = array[..., :3] - elif array.shape[-1] == 1: - array = np.concatenate([array, array, array], axis=-1) elif array.shape[-1] != 3: logger.warning(f"Unexpected image format: {array.shape}.") raise ValueError(f"Unexpected image format: {array.shape}.")
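
For illustration only: the snippet below is not part of any patch, and the names normalize_channels and samples_to_pil are hypothetical stand-ins for the helpers the final patch introduces. It is a minimal, self-contained sketch of the channel normalization that xref_to_image converges on, and it shows what the last commit message means by "dimensionally sparse" arrays: grayscale samples stay 2-D (h, w) and become a PIL "L" image instead of being stacked into three identical channels, while RGBA buffers are truncated to RGB.

import numpy as np
from PIL import Image


def normalize_channels(array: np.ndarray) -> np.ndarray:
    # Keep grayscale as a 2-D (h, w) array, drop the alpha channel of RGBA, reject anything else.
    if array.shape[-1] == 1:
        return array[:, :, 0]
    if array.shape[-1] == 4:
        return array[..., :3]
    if array.shape[-1] != 3:
        raise ValueError(f"Unexpected image format: {array.shape}.")
    return array


def samples_to_pil(samples: bytes, height: int, width: int, n_channels: int) -> Image.Image:
    # Same reshape as the pixmap-based conversion: raw sample bytes -> (h, w, n) uint8 array -> PIL image.
    array = np.frombuffer(samples, dtype=np.uint8).reshape(height, width, n_channels)
    return Image.fromarray(normalize_channels(array))


if __name__ == "__main__":
    h, w = 4, 6
    gray = samples_to_pil(bytes(h * w * 1), h, w, 1)  # kept as a (4, 6) array -> mode "L"
    rgb = samples_to_pil(bytes(h * w * 3), h, w, 3)   # already (4, 6, 3) -> mode "RGB"
    rgba = samples_to_pil(bytes(h * w * 4), h, w, 4)  # alpha dropped -> mode "RGB"
    print(gray.mode, rgb.mode, rgba.mode)             # prints: L RGB RGB

With a real document, the sample buffer, height, width and channel count would come from fitz.Pixmap(doc, xref) (pixmap.samples, pixmap.h, pixmap.w, pixmap.n), as in the patched xref_to_image.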