From 978f48e8f9b1167f5b483b775869e55d3c6ba56f Mon Sep 17 00:00:00 2001
From: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu, 2 Feb 2023 15:39:44 +0100
Subject: [PATCH] add ad hoc logic for bad xref handling

---
 .../image_extractor/extractors/parsable.py          | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py
index 805adc8..27e0f33 100644
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -111,9 +111,10 @@ def get_metadata_for_images_on_page(page: fitz.Page):
 
 def filter_valid_metadata(metadata):
     yield from compose(
-        # filter_out_page_sized_images, TODO: Link to concept for extraction toggling and reclassification endpoint
-        filter_out_tiny_images,
-        filter_invalid_metadata,
+        # filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint.
+        filter_invalid_metadata,  # TODO: this doesn't filter but raises if images are invalid; maybe it should filter.
+        filter_out_tiny_images,  # FIXME: this implicitly filters invalid metadata, e.g. zero-height images;
+                                 #  that should be done in filter_invalid_metadata instead.
     )(metadata)
 
 
@@ -137,7 +138,11 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
 @lru_cache(maxsize=None)
 def xref_to_image(doc, xref) -> Image:
     # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
-    pixmap = fitz.Pixmap(doc, xref)
+    try:
+        pixmap = fitz.Pixmap(doc, xref)
+    except ValueError:
+        # FIXME: Invalid xrefs occur here, which shouldn't be the case; None is returned for them.
+        return
     array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
     # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
     array = array[:, :, 0] if array.shape[2] == 1 else array