diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 805adc8..27e0f33 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -111,9 +111,10 @@ def get_metadata_for_images_on_page(page: fitz.Page): def filter_valid_metadata(metadata): yield from compose( - # filter_out_page_sized_images, TODO: Link to concept for extraction toggling and reclassification endpoint - filter_out_tiny_images, - filter_invalid_metadata, + # filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint. + filter_invalid_metadata, # TODO: this doesn't filter but raises if images are invalid, maybe should filter + filter_out_tiny_images, # FIXME: this implicitly filters invalid metadata, e.g. for zero height images, + # This should be done in filter_invalid_metadata. )(metadata) @@ -137,7 +138,11 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream - pixmap = fitz.Pixmap(doc, xref) + try: + pixmap = fitz.Pixmap(doc, xref) + except ValueError: + # FIXME: Invalid xrefs occur here, this shouldn't be the case. + return array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) array = array[:, :, 0] if array.shape[2] == 1 else array