Compare commits
2 Commits
master
...
release/1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ee48f141f8 | ||
|
|
c03913e088 |
@ -1,14 +1,17 @@
|
|||||||
import atexit
|
import atexit
|
||||||
import io
|
import io
|
||||||
|
import json
|
||||||
|
import traceback
|
||||||
from functools import partial, lru_cache
|
from functools import partial, lru_cache
|
||||||
from itertools import chain, starmap, filterfalse
|
from itertools import chain, starmap, filterfalse
|
||||||
from operator import itemgetter
|
from operator import itemgetter, truth
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from funcy import rcompose, merge, pluck, curry, compose
|
from funcy import rcompose, merge, pluck, curry, compose
|
||||||
|
|
||||||
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
from image_prediction.info import Info
|
from image_prediction.info import Info
|
||||||
from image_prediction.stitching.stitching import stitch_pairs
|
from image_prediction.stitching.stitching import stitch_pairs
|
||||||
@ -46,10 +49,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
clear_caches()
|
clear_caches()
|
||||||
|
|
||||||
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
||||||
|
# TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the
|
||||||
|
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||||
|
# again for the formatting step.
|
||||||
|
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||||
|
|
||||||
yield from image_metadata_pairs
|
yield from image_metadata_pairs
|
||||||
|
|
||||||
|
@staticmethod
def __filter_valid_images(image_metadata_pairs):
    """Lazily drop image/metadata pairs whose image cannot be processed.

    Each pair is unpacked into ``(image, metadata)``. An image counts as valid
    when resizing it and converting it to RGB completes without raising.
    Invalid images are logged as warnings (formatted metadata plus traceback)
    and omitted from the result.

    Args:
        image_metadata_pairs: Iterable of two-element pairs ``(image, metadata)``.

    Returns:
        A lazy iterator over ``ImageMetadataPair`` objects for the valid images.
    """

    def validate(image: Image.Image, metadata: dict):
        """Return the pair if the image is usable, otherwise log and return None."""
        try:
            # TODO: stand-in heuristic for testing if image is valid => find cleaner solution
            # The converted image is discarded on purpose; only the absence of an error matters.
            image.resize((100, 100)).convert("RGB")
        except Exception:
            # `Exception` already covers `OSError`; deliberately broad so a single broken
            # image is skipped with a warning rather than aborting the whole stream.
            formatted_metadata = json.dumps(EnumFormatter()(metadata), indent=2)
            logger.warning(
                f"Invalid image encountered. Image metadata:\n{formatted_metadata}\n\n{traceback.format_exc()}"
            )
            return None
        return ImageMetadataPair(image, metadata)

    # `None` results (invalid images) are falsy and removed here.
    return filter(truth, starmap(validate, image_metadata_pairs))
|
||||||
|
|
||||||
|
|
||||||
def extract_pages(doc, page_range):
|
def extract_pages(doc, page_range):
|
||||||
page_range = range(page_range.start + 1, page_range.stop + 1)
|
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from image_prediction.utils.banner import show_banner
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
def predict(pdf):
|
def predict(pdf):
|
||||||
# Keras service_estimator.predict stalls when service_estimator was loaded in different process;
|
# Keras service_estimator.predict stalls when service_estimator was loaded in different process;
|
||||||
# therefore, we re-load the model (part of the pipeline) every time we process a new document.
|
# therefore, we re-load the model (part of the pipeline) every time we process a new document.
|
||||||
|
|||||||
@ -17,7 +17,7 @@ pytest_plugins = [
|
|||||||
"test.fixtures.parameters",
|
"test.fixtures.parameters",
|
||||||
"test.fixtures.pdf",
|
"test.fixtures.pdf",
|
||||||
"test.fixtures.target",
|
"test.fixtures.target",
|
||||||
"test.unit_tests.image_stitching_test"
|
"test.unit_tests.image_stitching_test",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,7 @@ from funcy import rcompose, chunks
|
|||||||
|
|
||||||
|
|
||||||
def test_rcompose():
|
def test_rcompose():
|
||||||
f = rcompose(lambda x: x ** 2, str, lambda x: x * 2)
|
f = rcompose(lambda x: x**2, str, lambda x: x * 2)
|
||||||
assert f(3) == "99"
|
assert f(3) == "99"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user