From 73f7491c8f9c01967f95b9e0210779878f32f99c Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 2 Feb 2023 13:36:58 +0100 Subject: [PATCH] improve performance - disable scanned page filter, since dropping page-sized images disables the computation of the image hash and the frontend OCR hint, which are both wanted - optimize image extraction by using arrays instead of byte streams for the conversion to PIL images --- image_prediction/image_extractor/extractors/parsable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index ddb392d..b65e3d3 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -1,5 +1,4 @@ import atexit -import io import json import traceback from _operator import itemgetter @@ -9,6 +8,7 @@ from operator import itemgetter, truth from typing import Iterable, Iterator, List import fitz +import numpy as np from PIL import Image from funcy import merge, pluck, curry, compose, rcompose, remove @@ -111,7 +111,6 @@ def get_metadata_for_images_on_page(page: fitz.Page): def filter_valid_metadata(metadata): yield from compose( - filter_out_page_sized_images, filter_out_tiny_images, filter_invalid_metadata, )(metadata) @@ -136,8 +135,9 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: - maybe_image = load_image_handle_from_xref(doc, xref) - return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None + pixmap = fitz.Pixmap(doc, xref) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w) + return Image.fromarray(array) def get_image_metadata(image_info):