diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 970bc03..a0c24c1 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -47,7 +47,7 @@ class ParsablePDFImageExtractor(ImageExtractor): # FIXME: Heuristic filtering shouldn't take place here, # consider introducing a preprocessing step before extracting images, - # e.g. together with a image validation step for broken images. + # e.g. together with an image validation step for broken images. @filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): images = get_images_on_page(self.doc, page) diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py index d3f59b1..d6cb2f6 100644 --- a/image_prediction/image_extractor/filters.py +++ b/image_prediction/image_extractor/filters.py @@ -16,17 +16,17 @@ def filter_scanned_pages(page_processor: Callable): """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor. This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipline). A scanned page is defined by - - having only one image on a page + - having only one image on a page and - that image having an image_to_page ratio greater than the allowed max value - found in the CONFIG.filters.image_to_page_quotient.max""" + defined in CONFIG.filters.image_to_page_quotient.max""" - def inner(self: ImageExtractor, page: fitz.fitz.Page): + def inner(extractor: ImageExtractor, page: fitz.fitz.Page): metadata = get_metadata(page) if is_a_scanned_page(metadata): logger.debug(f"Page {page.number} won't be processed since it is a scanned page.") yield from [] else: - yield from page_processor(self, page) + yield from page_processor(extractor, page) logger.info(f"Extracting pages with filtering scanned pages...") return inner