diff --git a/config.yaml b/config.yaml index 6a6111a..9bfcaf1 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,6 @@ webserver: - host: $SERVER_HOST|"127.0.0.1" # webserver address - port: $SERVER_PORT|5000 # webserver port + host: $SERVER_HOST|"127.0.0.1" # Webserver address + port: $SERVER_PORT|5000 # Webserver port service: logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger diff --git a/image_prediction/exceptions.py b/image_prediction/exceptions.py index f03b42a..9c9ca49 100644 --- a/image_prediction/exceptions.py +++ b/image_prediction/exceptions.py @@ -36,3 +36,7 @@ class InvalidBox(Exception): class ParsingError(Exception): pass + + +class BadXref(ValueError): + pass diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index ad3655f..59ac8a5 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -13,7 +13,7 @@ from PIL import Image from funcy import merge, pluck, compose, rcompose, remove, keep from image_prediction.config import CONFIG -from image_prediction.exceptions import InvalidBox +from image_prediction.exceptions import InvalidBox, BadXref from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info diff --git a/image_prediction/locations.py b/image_prediction/locations.py index 1f14c1a..9374ace 100644 --- a/image_prediction/locations.py +++ b/image_prediction/locations.py @@ -3,15 +3,14 @@ from pathlib import Path MODULE_DIR = Path(__file__).resolve().parents[0] - PACKAGE_ROOT_DIR = MODULE_DIR.parents[0] CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml" - BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt" DATA_DIR = PACKAGE_ROOT_DIR / "data" - MLRUNS_DIR = str(DATA_DIR / "mlruns") -TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data" +TEST_DIR = PACKAGE_ROOT_DIR / "test" +TEST_DATA_DIR = TEST_DIR / "data" +TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc" diff --git a/image_prediction/utils/generic.py b/image_prediction/utils/generic.py index de71a5c..ffdf7b7 100644 --- a/image_prediction/utils/generic.py +++ b/image_prediction/utils/generic.py @@ -1,6 +1,15 @@ +from functools import wraps +from inspect import signature from itertools import starmap +from typing import Callable from funcy import iterate, first, curry, map +from pymonad.either import Left, Right, Either +from pymonad.tools import curry as pmcurry + +from image_prediction.utils import get_logger + +logger = get_logger() def until(cond, func, *args, **kwargs): @@ -13,3 +22,63 @@ def lift(fn): def starlift(fn): return curry(starmap)(fn) + + +def bottom(*args, **kwargs): + return False + + +def top(*args, **kwargs): + return True + + +def left(fn): + @wraps(fn) + def inner(x): + return Left(fn(x)) + + return inner + + +def right(fn): + @wraps(fn) + def inner(x): + return Right(fn(x)) + + return inner + + +def wrap_left(fn, success_condition=top, error_message=None) -> Callable: + return wrap_either(Left, Right, success_condition=success_condition, error_message=error_message)(fn) + + +def wrap_right(fn, success_condition=top, error_message=None) -> Callable: + return wrap_either(Right, Left, success_condition=success_condition, error_message=error_message)(fn) + + +def wrap_either(success_type, failure_type, success_condition=top, error_message=None) -> Callable: + @wraps(wrap_either) + def wrapper(fn) -> Callable: + + n_params = len(signature(fn).parameters) + + @pmcurry(n_params) + @wraps(fn) + def wrapper(*args, **kwargs) -> Either: + try: + result = fn(*args, **kwargs) + if success_condition(result): + return success_type(result) + else: + return failure_type({"error": error_message, "result": result}) + except Exception as err: + logger.error(err) + return failure_type({"error": error_message or err, "result": Void}) + + return wrapper + + return wrapper + + +class Void: + pass diff --git a/requirements.txt b/requirements.txt index da99202..3559e63 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,5 @@ pdf2image==1.16.0 frozendict==2.3.0 protobuf<=3.20.* prometheus-client==0.13.1 +fsspec==2022.11.0 +PyMonad==2.4.0 diff --git a/test/.gitignore b/test/.gitignore new file mode 100644 index 0000000..3af0ccb --- /dev/null +++ b/test/.gitignore @@ -0,0 +1 @@ +/data diff --git a/test/data.dvc b/test/data.dvc new file mode 100644 index 0000000..c7040fe --- /dev/null +++ b/test/data.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir + size: 107332 + nfiles: 4 + path: data diff --git a/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf b/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf deleted file mode 100644 index 41f0d70..0000000 Binary files a/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9.pdf and /dev/null differ diff --git a/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json b/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json deleted file mode 100644 index 1a1b3f5..0000000 --- a/test/data/f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json +++ /dev/null @@ -1,44 +0,0 @@ -[ - { - "classification": { - "label": "formula", - "probabilities": { - "formula": 1.0, - "logo": 0.0, - "other": 0.0, - "signature": 0.0 - } - }, - "representation": "FFFEF0C7033648170F3EFFFFF", - "position": { - "x1": 321, - "x2": 515, - "y1": 348, - "y2": 542, - "pageNumber": 2 - }, - "geometry": { - "width": 194, - "height": 194 - }, - "alpha": false, - "filters": { - "geometry": { - "imageSize": { - "quotient": 0.2741, - "tooLarge": false, - "tooSmall": false - }, - "imageFormat": { - "quotient": 1.0, - "tooTall": false, - "tooWide": false - } - }, - "probability": { - "unconfident": false - }, - "allPassed": true - } - } -] \ No newline at end of file diff --git a/test/data/stitching_with_tolerance.json b/test/data/stitching_with_tolerance.json deleted file mode 100644 index f7f1049..0000000 --- a/test/data/stitching_with_tolerance.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "input": [ - { - "width": 100, - "height": 8, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 0, - "y1": 0, - "x2": 100, - "y2": 8 - }, - { - "width": 100, - "height": 9, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 0, - "y1": 9, - "x2": 100, - "y2": 18 - }, - { - "width": 100, - "height": 35, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 0, - "y1": 18, - "x2": 100, - "y2": 53 - }, - { - "width": 47, - "height": 46, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 0, - "y1": 54, - "x2": 47, - "y2": 100 - }, - { - "width": 31, - "height": 46, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 48, - "y1": 54, - "x2": 79, - "y2": 100 - }, - { - "width": 20, - "height": 19, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 80, - "y1": 54, - "x2": 100, - "y2": 73 - }, - { - "width": 20, - "height": 27, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 80, - "y1": 73, - "x2": 100, - "y2": 100 - } - ], - "target": { - "width": 100, - "height": 100, - "page_idx": 0, - "page_width": 100, - "page_height": 100, - "x1": 0, - "y1": 0, - "x2": 100, - "y2": 100 - } -} diff --git a/test/fixtures/input.py b/test/fixtures/input.py index b02f414..2054df6 100644 --- a/test/fixtures/input.py +++ b/test/fixtures/input.py @@ -1,7 +1,21 @@ import numpy as np import pytest +from dvc.repo import Repo + +from image_prediction.locations import PACKAGE_ROOT_DIR, TEST_DATA_DIR_DVC +from image_prediction.utils import get_logger + +logger = get_logger() @pytest.fixture def input_batch(batch_size, input_size): return np.random.random_sample(size=(batch_size, *input_size)) + + +@pytest.fixture(scope="session") +def dvc_test_data(): + logger.info("Pulling data with DVC...") + # noinspection PyCallingNonCallable + Repo(PACKAGE_ROOT_DIR).pull(targets=[str(TEST_DATA_DIR_DVC)]) + logger.info("Finished pulling data.") diff --git a/test/fixtures/pdf.py b/test/fixtures/pdf.py index 7353917..0991bbe 100644 --- a/test/fixtures/pdf.py +++ b/test/fixtures/pdf.py @@ -4,7 +4,7 @@ import fpdf import pytest from image_prediction.locations import TEST_DATA_DIR -from test.utils.generation.pdf import add_image, pdf_stream +from test.utils.generation.pdf import add_image, pdf_stream, stream_pdf_bytes @pytest.fixture @@ -18,6 +18,10 @@ def pdf(image_metadata_pairs): @pytest.fixture -def real_pdf(): - with open(os.path.join(TEST_DATA_DIR, "f2dc689ca794fccb8cd38b95f2bf6ba9.pdf"), "rb") as f: - yield f.read() +def real_pdf(dvc_test_data): + yield from stream_pdf_bytes(TEST_DATA_DIR / "f2dc689ca794fccb8cd38b95f2bf6ba9.pdf") + + +@pytest.fixture +def bad_xref_pdf(dvc_test_data): + yield from stream_pdf_bytes(TEST_DATA_DIR / "bad_xref.pdf") diff --git a/test/fixtures/target.py b/test/fixtures/target.py index 23f23bd..1f111fc 100644 --- a/test/fixtures/target.py +++ b/test/fixtures/target.py @@ -87,7 +87,7 @@ def expected_predictions_mapped_and_formatted(expected_predictions_mapped): @pytest.fixture -def real_expected_service_response(): +def real_expected_service_response(dvc_test_data): with open(os.path.join(TEST_DATA_DIR, "f2dc689ca794fccb8cd38b95f2bf6ba9_predictions.json"), "r") as f: yield json.load(f) diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index 8e6916c..92a705a 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -9,7 +9,13 @@ from funcy import first, rest from image_prediction.extraction import extract_images_from_pdf from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel, get_image_infos +from image_prediction.image_extractor.extractors.parsable import ( + extract_pages, + has_alpha_channel, + get_image_infos, + extract_valid_metadata, + xref_to_image, +) from image_prediction.info import Info from test.utils.comparison import metadata_equal, image_sets_equal from test.utils.generation.pdf import add_image, pdf_stream @@ -75,3 +81,12 @@ def test_has_alpha_channel(base_patch_metadata, suffix, mode): assert not list(rest(xrefs)) doc.close() + + +def test_bad_xref_handling(bad_xref_pdf, dvc_test_data): + + doc = fitz.Document(stream=bad_xref_pdf) + metadata = extract_valid_metadata(doc, first(doc)) + xref = first(metadata)[Info.XREF] + + assert not xref_to_image(doc, xref) diff --git a/test/unit_tests/image_stitching_test.py b/test/unit_tests/image_stitching_test.py index edf7923..3762036 100644 --- a/test/unit_tests/image_stitching_test.py +++ b/test/unit_tests/image_stitching_test.py @@ -60,10 +60,10 @@ def test_image_stitcher(patch_image_metadata_pairs, base_patch_metadata, base_pa assert images_equal(pair_stitched.image.resize((10, 10)), base_patch_image.resize((10, 10)), atol=0.4) -def test_image_stitcher_with_gaps_must_succeed(): +def test_image_stitcher_with_gaps_must_succeed(dvc_test_data): from image_prediction.locations import TEST_DATA_DIR - with open(os.path.join(TEST_DATA_DIR, "stitching_with_tolerance.json")) as f: + with open(TEST_DATA_DIR / "stitching_with_tolerance.json") as f: patches_metadata, base_patch_metadata = itemgetter("input", "target")(ReverseEnumFormatter(Info)(json.load(f))) images = map(gray_image_from_metadata, patches_metadata) diff --git a/test/utils/generation/pdf.py b/test/utils/generation/pdf.py index 852647e..111a6d4 100644 --- a/test/utils/generation/pdf.py +++ b/test/utils/generation/pdf.py @@ -28,3 +28,8 @@ def add_image_to_last_page(pdf: fpdf.fpdf.FPDF, image_metadata_pair, suffix): with tempfile.NamedTemporaryFile(suffix=f".{suffix}") as temp_image: image.save(temp_image.name) pdf.image(temp_image.name, x=x, y=y, w=w, h=h, type=suffix) + + +def stream_pdf_bytes(path: str): + with open(path, "rb") as f: + yield f.read()