beautify

2023-02-02 15:26:33 +01:00 · 2023-02-02 15:26:33 +01:00 · 94652aafe4
commit 94652aafe4
parent c4416636c0
1 changed files with 3 additions and 0 deletions
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -111,6 +111,7 @@ def get_metadata_for_images_on_page(page: fitz.Page):

 def filter_valid_metadata(metadata):
    yield from compose(
+        # filter_out_page_sized_images, TODO: Link to concept for extraction toggling and reclassification endpoint
        filter_out_tiny_images,
        filter_invalid_metadata,
    )(metadata)
@@ -135,8 +136,10 @@ def get_image_infos(page: fitz.Page) -> List[dict]:

@lru_cache(maxsize=None)
 def xref_to_image(doc, xref) -> Image:
+    # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
    pixmap = fitz.Pixmap(doc, xref)
    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
+    # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
    array = array[:, :, 0] if array.shape[2] == 1 else array
    return Image.fromarray(array)