improve performance
- Disable the scanned page filter, since dropping scanned pages disables the computation of the image hash and the frontend OCR hint, both of which are wanted.
- Optimize image extraction by using arrays instead of byte streams for the conversion to PIL images.
This commit is contained in:
parent
2385584dcb
commit
73f7491c8f
@ -1,5 +1,4 @@
|
||||
import atexit
|
||||
import io
|
||||
import json
|
||||
import traceback
|
||||
from _operator import itemgetter
|
||||
@ -9,6 +8,7 @@ from operator import itemgetter, truth
|
||||
from typing import Iterable, Iterator, List
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from funcy import merge, pluck, curry, compose, rcompose, remove
|
||||
|
||||
@ -111,7 +111,6 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
def filter_valid_metadata(metadata):
|
||||
yield from compose(
|
||||
filter_out_page_sized_images,
|
||||
filter_out_tiny_images,
|
||||
filter_invalid_metadata,
|
||||
)(metadata)
|
||||
@ -136,8 +135,9 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def xref_to_image(doc, xref) -> Image:
|
||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||
return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None
|
||||
pixmap = fitz.Pixmap(doc, xref)
|
||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w)
|
||||
return Image.fromarray(array)
|
||||
|
||||
|
||||
def get_image_metadata(image_info):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user