improve the readability of variable names and docstrings

This commit is contained in:
Julius Unverfehrt 2023-02-01 10:08:36 +01:00
parent 368c54a8be
commit 876260f403
2 changed files with 5 additions and 5 deletions

View File

@ -47,7 +47,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
# FIXME: Heuristic filtering shouldn't take place here,
# consider introducing a preprocessing step before extracting images,
# e.g. together with a image validation step for broken images.
# e.g. together with an image validation step for broken images.
@filter_scanned_pages
def __process_images_on_page(self, page: fitz.fitz.Page):
images = get_images_on_page(self.doc, page)

View File

@ -16,17 +16,17 @@ def filter_scanned_pages(page_processor: Callable):
"""Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor.
This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipeline).
A scanned page is defined by
- having only one image on a page
- having only one image on a page and
- that image having an image_to_page ratio greater than the allowed max value
found in the CONFIG.filters.image_to_page_quotient.max"""
defined in CONFIG.filters.image_to_page_quotient.max"""
def inner(self: ImageExtractor, page: fitz.fitz.Page):
def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
metadata = get_metadata(page)
if is_a_scanned_page(metadata):
logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
yield from []
else:
yield from page_processor(self, page)
yield from page_processor(extractor, page)
logger.info(f"Extracting pages with filtering scanned pages...")
return inner