refactoring
This commit is contained in:
parent
eff1bb4124
commit
2995d5ee48
@ -5,7 +5,7 @@ from _operator import itemgetter
|
||||
from functools import partial, lru_cache
|
||||
from itertools import chain, starmap, filterfalse
|
||||
from operator import itemgetter, truth
|
||||
from typing import Iterable, Iterator, List
|
||||
from typing import Iterable, Iterator, List, Union
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
@ -18,7 +18,7 @@ from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box
|
||||
from image_prediction.stitching.utils import validate_box
|
||||
from image_prediction.transformer.transformers.response import compute_geometric_quotient
|
||||
from image_prediction.utils import get_logger
|
||||
from image_prediction.utils.generic import lift
|
||||
@ -95,7 +95,7 @@ def get_images_on_page(doc, metadata):
|
||||
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
|
||||
return compose(
|
||||
list,
|
||||
partial(add_alpha_channel_info, doc, page),
|
||||
partial(add_alpha_channel_info, doc),
|
||||
filter_valid_metadata,
|
||||
get_metadata_for_images_on_page,
|
||||
)(page)
|
||||
@ -115,18 +115,18 @@ def filter_valid_metadata(metadata):
|
||||
# and giving the user the ability to reclassify false positives with a separate call.
|
||||
# filter_out_page_sized_images,
|
||||
filter_out_tiny_images,
|
||||
filter_invalid_metadata,
|
||||
filter_out_invalid_metadata,
|
||||
)(metadata)
|
||||
|
||||
|
||||
def filter_invalid_metadata(metadata):
|
||||
def invalid_box_filter(box):
|
||||
def filter_out_invalid_metadata(metadata):
|
||||
def __validate_box(box):
|
||||
try:
|
||||
return validate_box(box)
|
||||
except InvalidBox as e:
|
||||
logger.debug(f"Dropping invalid metadatum, reason: {e}")
|
||||
except InvalidBox as err:
|
||||
logger.debug(f"Dropping invalid metadatum, reason: {err}")
|
||||
|
||||
yield from keep(invalid_box_filter, metadata)
|
||||
yield from keep(__validate_box, metadata)
|
||||
|
||||
|
||||
def filter_out_page_sized_images(metadata):
|
||||
@ -143,18 +143,17 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def xref_to_image(doc, xref) -> Image:
|
||||
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
|
||||
# NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
|
||||
try:
|
||||
pixmap = fitz.Pixmap(doc, xref)
|
||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||
# TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
|
||||
array = array[:, :, 0] if array.shape[2] == 1 else array
|
||||
return Image.fromarray(array)
|
||||
except ValueError:
|
||||
# FIXME: Invalid xrefs occur here, this shouldn't be the case.
|
||||
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||
return
|
||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||
# TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
|
||||
array = array[:, :, 0] if array.shape[2] == 1 else array
|
||||
return Image.fromarray(array)
|
||||
|
||||
|
||||
def get_image_metadata(image_info):
|
||||
@ -176,29 +175,19 @@ def get_image_metadata(image_info):
|
||||
}
|
||||
|
||||
|
||||
def validate_coords_and_passthrough(metadata):
|
||||
yield from map(validate_box_coords, metadata)
|
||||
|
||||
|
||||
def validate_size_and_passthrough(metadata):
|
||||
yield from map(validate_box_size, metadata)
|
||||
|
||||
|
||||
def add_page_metadata(page, metadata):
|
||||
yield from map(partial(merge, get_page_metadata(page)), metadata)
|
||||
|
||||
|
||||
def add_alpha_channel_info(doc, page, metadata):
|
||||
def add_alpha_channel_info(doc, metadata):
|
||||
def add_alpha_value_to_metadatum(metadatum):
|
||||
alpha = metadatum_to_alpha_value(metadatum)
|
||||
return {**metadatum, Info.ALPHA: alpha}
|
||||
|
||||
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
|
||||
xref_to_alpha = partial(has_alpha_channel, doc)
|
||||
page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
|
||||
alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
|
||||
page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)
|
||||
metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))
|
||||
|
||||
metadata = starmap(merge, zip(metadata, page_to_alpha_mapping_per_image(page)))
|
||||
|
||||
yield from metadata
|
||||
yield from map(add_alpha_value_to_metadatum, metadata)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user