add ad hoc logic for bad xref handling
This commit is contained in:
parent
94652aafe4
commit
978f48e8f9
@ -111,9 +111,10 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
def filter_valid_metadata(metadata):
    """Yield the image metadata entries that pass the composed filter chain.

    The individual filters are combined with ``compose`` and the resulting
    callable is applied to ``metadata``; whatever it produces is yielded.

    NOTE(review): the order in which the filters run depends on the
    semantics of the imported ``compose`` helper -- confirm before relying
    on ``filter_invalid_metadata`` running before ``filter_out_tiny_images``.
    """
    yield from compose(
        # filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint.
        filter_invalid_metadata,  # TODO: this doesn't filter but raises if images are invalid, maybe should filter
        filter_out_tiny_images,  # FIXME: this implicitly filters invalid metadata, e.g. for zero height images,
        # This should be done in filter_invalid_metadata.
    )(metadata)
|
||||
|
||||
|
||||
@ -137,7 +138,11 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
|
||||
@lru_cache(maxsize=None)
|
||||
def xref_to_image(doc, xref) -> Image:
|
||||
# NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
|
||||
pixmap = fitz.Pixmap(doc, xref)
|
||||
try:
|
||||
pixmap = fitz.Pixmap(doc, xref)
|
||||
except ValueError:
|
||||
# FIXME: Invalid xrefs occur here, this shouldn't be the case.
|
||||
return
|
||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||
# TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
|
||||
array = array[:, :, 0] if array.shape[2] == 1 else array
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user