improve the readability of variable names and docstrings
This commit is contained in:
parent
368c54a8be
commit
876260f403
@ -47,7 +47,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
|
||||
# FIXME: Heuristic filtering shouldn't take place here,
|
||||
# consider introducing a preprocessing step before extracting images,
|
||||
# e.g. together with a image validation step for broken images.
|
||||
# e.g. together with an image validation step for broken images.
|
||||
@filter_scanned_pages
|
||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||
images = get_images_on_page(self.doc, page)
|
||||
|
||||
@ -16,17 +16,17 @@ def filter_scanned_pages(page_processor: Callable):
|
||||
"""Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor.
|
||||
This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipeline).
|
||||
A scanned page is defined by
|
||||
- having only one image on a page
|
||||
- having only one image on a page and
|
||||
- that image having an image_to_page ratio greater than the allowed max value
|
||||
found in the CONFIG.filters.image_to_page_quotient.max"""
|
||||
defined in CONFIG.filters.image_to_page_quotient.max"""
|
||||
|
||||
def inner(self: ImageExtractor, page: fitz.fitz.Page):
|
||||
def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
|
||||
metadata = get_metadata(page)
|
||||
if is_a_scanned_page(metadata):
|
||||
logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
|
||||
yield from []
|
||||
else:
|
||||
yield from page_processor(self, page)
|
||||
yield from page_processor(extractor, page)
|
||||
|
||||
logger.info(f"Extracting pages with filtering scanned pages...")
|
||||
return inner
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user