Matthias Bisping 5cdf93b923 Pull request #39: RED-6084 Improve image extraction speed
Merge in RR/image-prediction from RED-6084-adhoc-scanned-pages-filtering-refactoring to master

Squashed commit of the following:

commit bd6d83e7363b1c1993babcceb434110a6312c645
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Thu Feb 9 16:08:25 2023 +0100

    Tweak logging

commit 55bdd48d2a3462a8b4a6b7194c4a46b21d74c455
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Thu Feb 9 15:47:31 2023 +0100

    Update dependencies

commit 970275b25708c05e4fbe78b52aa70d791d5ff17a
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Thu Feb 9 15:35:37 2023 +0100

    Refactoring

    Make alpha channel check monadic to streamline error handling

commit e99e97e23fd8ce16f9a421d3e5442fccacf71ead
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 14:32:29 2023 +0100

    Refactoring

    - Rename
    - Refactor image extraction functions

commit 76b1b0ca2401495ec03ba2b6483091b52732eb81
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 11:55:30 2023 +0100

    Refactoring

commit cb1c461049d7c43ec340302f466447da9f95a499
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 11:44:01 2023 +0100

    Refactoring

commit 092069221a85ac7ac19bf838dcbc7ab1fde1e12b
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 10:18:53 2023 +0100

    Add to-do

commit 3cea4dad2d9703b8c79ddeb740b66a3b8255bb2a
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 10:11:35 2023 +0100

    Refactoring

    - Rename
    - Add typehints everywhere

commit 865e0819a14c420bc2edff454d41092c11c019a4
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 19:38:57 2023 +0100

    Add type explanation

commit 01d3d5d33f1ccb05aea1cec1d1577572b1a4deaa
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 19:37:49 2023 +0100

    Formatting

commit dffe1c18fc3a322a6b08890d4438844e8122faaf
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 19:34:13 2023 +0100

    [WIP] Either refactoring

    Add alternative formulation for monadic chain

commit 066cf17add404a313520cd794c06e3264cf971c9
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 18:40:30 2023 +0100

    [WIP] Either refactoring

commit f53f0fea298cdab88deb090af328b34d37e0198e
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 18:18:34 2023 +0100

    [WIP] Either refactoring

    Propagate error and metadata

commit 274a5f56d4fcb9c67fac5cf43e9412ec1ab5179e
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 17:51:35 2023 +0100

    [WIP] Either refactoring

    Fix test assertion

commit 3235a857f6e418e50484cbfff152b0f63efb2f53
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 16:57:31 2023 +0100

    [WIP] Either-refactoring

    Replace Maybe with Either to allow passing on error information or
    metadata which otherwise get sucked up by Nothing.

commit 89989543d87490f8b20a0a76055605d34345e8f4
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 16:12:40 2023 +0100

    [WIP] Monadic refactoring

    Integrate image validation step into monadic chain.

    At the moment we lose the error information this way. Refactoring to
    the Either monad can bring it back.

commit 022bd4856a51aa085df5fe983fd77b99b53d594c
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:16:41 2023 +0100

    [WIP] Monadic refactoring

commit ca3898cb539607c8c3dd01c57e60211a5fea8a7d
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:10:34 2023 +0100

    [WIP] Monadic refactoring

commit d8f37bed5cbd6bdd2a0b52bae46fcdbb50f9dff2
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:09:51 2023 +0100

    [WIP] Monadic refactoring

commit 906fee0e5df051f38076aa1d2725e52a182ade13
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:03:35 2023 +0100

    [WIP] Monadic refactoring

... and 35 more commits
2023-02-10 08:33:13 +01:00

247 lines
8.7 KiB
Python

import json
import os
from copy import deepcopy
from functools import partial
from itertools import starmap, repeat
from operator import itemgetter
from typing import List
import fpdf
import pdf2image
import pytest
from funcy import juxt, one, first
from image_prediction.formatter.formatters.enum import ReverseEnumFormatter
from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.grouping import group_by_coordinate
from image_prediction.stitching.merging import (
merge_metadata_horizontally,
merge_metadata_vertically,
merge_pair_horizontally,
merge_pair_vertically,
concat_images_horizontally,
concat_images_vertically,
merge_group_horizontally,
merge_group_vertically,
)
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import (
make_coord_getter,
make_length_getter,
)
from test.utils.comparison import images_equal
from test.utils.generation.image import random_single_color_image_from_metadata, gray_image_from_metadata
from test.utils.generation.pdf import add_image
from test.utils.stitching import BoxSplitter
# One getter per coordinate name, each produced by make_coord_getter.
x1_getter = make_coord_getter("x1")
y1_getter = make_coord_getter("y1")
x2_getter = make_coord_getter("x2")
y2_getter = make_coord_getter("y2")

# One getter per length name, each produced by make_length_getter.
width_getter = make_length_getter("width")
height_getter = make_length_getter("height")
def test_group_by_coordinate_exact():
    """Grouping on the first tuple element with tolerance 0 yields the expected groups."""
    points = [(0, 1), (0, 3), (1, 4), (1, 4), (1, 2), (3, 3)]
    expected_groups = [
        [(0, 1), (0, 3)],
        [(1, 4), (1, 4), (1, 2)],
        [(3, 3)],
    ]

    groups = [*group_by_coordinate(points, itemgetter(0), tolerance=0)]

    assert groups == expected_groups
def test_group_by_coordinate_fuzzy():
    """Grouping on the first tuple element with tolerance 1 yields the expected groups."""
    points = [(0, 1), (1, 3), (1, 4), (2, 4), (2, 2), (3, 3)]
    expected_groups = [
        [(0, 1), (1, 3), (1, 4)],
        [(2, 4), (2, 2), (3, 3)],
    ]

    groups = [*group_by_coordinate(points, itemgetter(0), tolerance=1)]

    assert groups == expected_groups
def test_image_stitcher(patch_image_metadata_pairs, base_patch_metadata, base_patch_image):
    """Stitch the patch pairs and compare the single result with the base patch.

    The metadata must match exactly; the images are compared after both are
    resized to 10x10, using ``atol=0.4``.
    """
    stitched = stitch_pairs(patch_image_metadata_pairs)
    head = first(stitched)

    assert len(stitched) == 1
    assert head.metadata == base_patch_metadata

    thumbnail_size = (10, 10)
    stitched_thumbnail = head.image.resize(thumbnail_size)
    expected_thumbnail = base_patch_image.resize(thumbnail_size)
    assert images_equal(stitched_thumbnail, expected_thumbnail, atol=0.4)
def test_image_stitcher_with_gaps_must_succeed(dvc_test_data):
    """Stitch the patches described in ``stitching_with_tolerance.json`` with tolerance 7.

    The loaded JSON is passed through ``ReverseEnumFormatter(Info)``; the
    ``"input"`` entry of the result is used as the patch metadata and the
    ``"target"`` entry as the expected merged metadata. Stitching must yield
    exactly one pair whose metadata equals the target.

    Args:
        dvc_test_data: Fixture requested for its setup only; its value is not
            used in this test (presumably it makes the data file available --
            confirm against the fixture definition).
    """
    from image_prediction.locations import TEST_DATA_DIR

    # Explicit encoding: without it, decoding depends on the platform's
    # locale encoding, while JSON files are expected to be UTF-8.
    with open(TEST_DATA_DIR / "stitching_with_tolerance.json", encoding="utf-8") as f:
        raw_content = json.load(f)

    formatted = ReverseEnumFormatter(Info)(raw_content)
    patches_metadata, base_patch_metadata = itemgetter("input", "target")(formatted)

    images = map(gray_image_from_metadata, patches_metadata)
    patch_image_metadata_pairs = list(starmap(ImageMetadataPair, zip(images, patches_metadata)))

    pairs_stitched = stitch_pairs(patch_image_metadata_pairs, tolerance=7)

    assert len(pairs_stitched) == 1
    pair_stitched = first(pairs_stitched)
    assert pair_stitched.metadata == base_patch_metadata
@pytest.mark.parametrize("noise", [(0, 2)])
@pytest.mark.parametrize("split_count", [5])
@pytest.mark.parametrize("width", [100])
@pytest.mark.parametrize("height", [100])
@pytest.mark.parametrize("page_width", [100])
@pytest.mark.parametrize("page_height", [100])
@pytest.mark.parametrize("execution_number", range(100))
@pytest.mark.xfail(reason="Does not always succeed due to locally maximizing merging logic.")
def test_image_stitcher_with_gaps_can_fail(patch_image_metadata_pairs, base_patch_metadata, execution_number):
    """Stitch the patches with tolerance 4; marked xfail because it may not succeed.

    Args:
        patch_image_metadata_pairs: The pairs to stitch.
        base_patch_metadata: Metadata the single stitched pair must equal.
        execution_number: Not read by the body; it is parametrized over
            ``range(100)`` so that the test is executed 100 times.
    """
    stitched = stitch_pairs(patch_image_metadata_pairs, tolerance=4)

    # The count is checked first, so the metadata of the first element is
    # only inspected when exactly one pair came back.
    assert len(stitched) == 1
    assert first(stitched).metadata == base_patch_metadata
def test_merge_group_horizontally(horizontal_merge_test_pairs):
    """Check horizontal group merging for a group of two and a group of three pairs.

    Two pairs must merge into the expected pair. After adding a third pair
    (a copy of the second with HEIGHT and Y2 increased by 30), two pairs must
    remain, exactly one of which equals the expected merged pair.
    """
    pair_a, pair_b, expected_pair = horizontal_merge_test_pairs

    merged = merge_group_horizontally([pair_a, pair_b])
    assert len(merged) == 1
    assert pair_equal(merged[0], expected_pair)

    # Third pair: the second pair's metadata with HEIGHT and Y2 raised by 30.
    extra_metadata = deepcopy(pair_b.metadata)
    for key in (Info.HEIGHT, Info.Y2):
        extra_metadata[key] += 30
    extra_pair = ImageMetadataPair(gray_image_from_metadata(extra_metadata), extra_metadata)

    merged = merge_group_horizontally([pair_a, pair_b, extra_pair])
    assert len(merged) == 2
    assert one(lambda candidate: pair_equal(expected_pair, candidate), merged)
def test_merge_group_vertically(vertical_merge_test_pairs):
    """Check vertical group merging for a group of two and a group of three pairs.

    Two pairs must merge into the expected pair. After adding a third pair
    (a copy of the second with WIDTH and X2 increased by 30), two pairs must
    remain, exactly one of which equals the expected merged pair.
    """
    pair_a, pair_b, expected_pair = vertical_merge_test_pairs

    merged = merge_group_vertically([pair_a, pair_b])
    assert len(merged) == 1
    assert pair_equal(merged[0], expected_pair)

    # Third pair: the second pair's metadata with WIDTH and X2 raised by 30.
    extra_metadata = deepcopy(pair_b.metadata)
    for key in (Info.WIDTH, Info.X2):
        extra_metadata[key] += 30
    extra_pair = ImageMetadataPair(gray_image_from_metadata(extra_metadata), extra_metadata)

    merged = merge_group_vertically([pair_a, pair_b, extra_pair])
    assert len(merged) == 2
    assert one(lambda candidate: pair_equal(expected_pair, candidate), merged)
def pair_equal(pr1, pr2):
    """Compare two pairs by their ``metadata`` and, if those match, by their images.

    Args:
        pr1: Object exposing ``metadata`` and ``image`` attributes.
        pr2: Object exposing ``metadata`` and ``image`` attributes.

    Returns:
        The result of the metadata comparison if it is falsy, otherwise the
        result of ``images_equal`` applied to the two images.
    """
    same_metadata = pr1.metadata == pr2.metadata
    if not same_metadata:
        # Images are not inspected when the metadata already differ.
        return same_metadata
    return images_equal(pr1.image, pr2.image)
def test_merge_pairs_horizontally(horizontal_merge_test_pairs):
    """Merging the first two fixture pairs horizontally must equal the third."""
    pair_a, pair_b, expected_pair = horizontal_merge_test_pairs
    assert pair_equal(merge_pair_horizontally(pair_a, pair_b), expected_pair)
def test_merge_pairs_vertically(vertical_merge_test_pairs):
    """Merging the first two fixture pairs vertically must equal the third."""
    pair_a, pair_b, expected_pair = vertical_merge_test_pairs
    assert pair_equal(merge_pair_vertically(pair_a, pair_b), expected_pair)
@pytest.fixture
def horizontal_merge_test_pairs(horizontal_merge_test_metadata):
    """Pair each horizontal-merge metadata record with an image from ``gray_image_from_metadata``.

    Returns:
        A list of ``ImageMetadataPair`` objects in the order of the metadata.
    """
    return [
        ImageMetadataPair(gray_image_from_metadata(metadata), metadata)
        for metadata in horizontal_merge_test_metadata
    ]
@pytest.fixture
def vertical_merge_test_pairs(vertical_merge_test_metadata):
    """Pair each vertical-merge metadata record with an image from ``gray_image_from_metadata``.

    Returns:
        A list of ``ImageMetadataPair`` objects in the order of the metadata.
    """
    return [
        ImageMetadataPair(gray_image_from_metadata(metadata), metadata)
        for metadata in vertical_merge_test_metadata
    ]
def test_merge_metadata_horizontally(horizontal_merge_test_metadata):
    """Horizontally merging the first two metadata records must give the third."""
    meta_a, meta_b, expected_metadata = horizontal_merge_test_metadata
    actual_metadata = merge_metadata_horizontally(meta_a, meta_b)
    assert actual_metadata == expected_metadata
def test_merge_metadata_vertically(vertical_merge_test_metadata):
    """Vertically merging the first two metadata records must give the third."""
    meta_a, meta_b, expected_metadata = vertical_merge_test_metadata
    actual_metadata = merge_metadata_vertically(meta_a, meta_b)
    assert actual_metadata == expected_metadata
@pytest.fixture
def horizontal_merge_test_metadata(merge_test_metadata):
    """Prepare two metadata records plus their expected horizontal merge.

    The second record is moved so its X1 equals the first record's X2, and
    its X2 is recomputed from its WIDTH. The expected record gets the summed
    WIDTH of both records and the second record's X2.

    Returns:
        Tuple ``(first, second, expected_merged)``.
    """
    meta_a, meta_b, meta_merged = merge_test_metadata

    # Move the second record so it starts where the first one ends on x.
    meta_b[Info.X1] = meta_a[Info.X2]
    meta_b[Info.X2] = meta_b[Info.X1] + meta_b[Info.WIDTH]

    merged_fields = {
        Info.WIDTH: meta_a[Info.WIDTH] + meta_b[Info.WIDTH],
        Info.X2: meta_b[Info.X2],
    }
    meta_merged.update(merged_fields)

    return meta_a, meta_b, meta_merged
@pytest.fixture
def vertical_merge_test_metadata(merge_test_metadata):
    """Prepare two metadata records plus their expected vertical merge.

    The second record is moved so its Y1 equals the first record's Y2, and
    its Y2 is recomputed from its HEIGHT. The expected record gets the summed
    HEIGHT of both records and the second record's Y2.

    Returns:
        Tuple ``(first, second, expected_merged)``.
    """
    meta_a, meta_b, meta_merged = merge_test_metadata

    # Move the second record so it starts where the first one ends on y.
    meta_b[Info.Y1] = meta_a[Info.Y2]
    meta_b[Info.Y2] = meta_b[Info.Y1] + meta_b[Info.HEIGHT]

    merged_fields = {
        Info.HEIGHT: meta_a[Info.HEIGHT] + meta_b[Info.HEIGHT],
        Info.Y2: meta_b[Info.Y2],
    }
    meta_merged.update(merged_fields)

    return meta_a, meta_b, meta_merged
@pytest.fixture
def merge_test_metadata(base_patch_metadata):
    """Produce three independent deep copies of ``base_patch_metadata``.

    Returns:
        Whatever ``juxt`` yields for three ``deepcopy`` callables applied to
        the base metadata; consumers unpack it into three values.
    """
    copiers = repeat(deepcopy, 3)
    make_copies = juxt(*copiers)
    return make_copies(base_patch_metadata)
@pytest.fixture
def base_patch_image(stitch_test_pdf):
    """Convert the stitch-test PDF bytes with ``pdf2image`` and return the first result."""
    converted = pdf2image.convert_from_bytes(stitch_test_pdf)
    return converted[0]
def test_concat_images_horizontally(horizontal_merge_test_metadata):
    """Horizontal concatenation must match the image generated from the merged metadata.

    Both the image size and the image content are compared.
    """
    meta_a, meta_b, meta_merged = horizontal_merge_test_metadata

    image_a = gray_image_from_metadata(meta_a)
    image_b = gray_image_from_metadata(meta_b)
    expected_image = gray_image_from_metadata(meta_merged)

    actual_image = concat_images_horizontally(image_a, image_b, meta_merged)

    assert actual_image.size == expected_image.size
    assert images_equal(actual_image, expected_image)
def test_concat_images_vertically(vertical_merge_test_metadata):
    """Vertical concatenation must match the image generated from the merged metadata.

    Both the image size and the image content are compared.
    """
    meta_a, meta_b, meta_merged = vertical_merge_test_metadata

    image_a = gray_image_from_metadata(meta_a)
    image_b = gray_image_from_metadata(meta_b)
    expected_image = gray_image_from_metadata(meta_merged)

    actual_image = concat_images_vertically(image_a, image_b, meta_merged)

    assert actual_image.size == expected_image.size
    assert images_equal(actual_image, expected_image)
@pytest.fixture
def stitch_test_pdf(patch_image_metadata_pairs, width, height):
    """Build a PDF from the patch pairs and return it as bytes.

    An ``FPDF`` document with unit ``"pt"`` and format ``(width, height)`` is
    created, every pair is added through ``add_image``, and the string output
    is encoded as latin1.
    """
    page_format = (width, height)
    document = fpdf.FPDF(unit="pt", format=page_format)

    for patch_pair in patch_image_metadata_pairs:
        add_image(document, patch_pair)

    rendered = document.output(dest="S")
    return rendered.encode("latin1")
@pytest.fixture
def patch_image_metadata_pairs(patches_metadata) -> List[ImageMetadataPair]:
    """Pair each patch metadata record with an image from ``random_single_color_image_from_metadata``.

    Returns:
        A list of ``ImageMetadataPair`` objects in the order of ``patches_metadata``.
    """
    return [
        ImageMetadataPair(random_single_color_image_from_metadata(metadata), metadata)
        for metadata in patches_metadata
    ]
@pytest.fixture
def patches_metadata(base_patch_metadata, noise, split_count):
    """Split ``base_patch_metadata`` with a ``BoxSplitter(noise)`` and return the parts as a list.

    ``split_count`` is forwarded to ``split_box`` as its second argument.
    """
    splitter = BoxSplitter(noise)
    return list(splitter.split_box(base_patch_metadata, split_count))
@pytest.fixture(params=[(0, 0)])
def noise(request):
    """Provide the fixture's current parameter; the only declared value is ``(0, 0)``."""
    current_value = request.param
    return current_value
@pytest.fixture(params=[5])
def split_count(request):
    """Provide the fixture's current parameter; the only declared value is ``5``."""
    current_value = request.param
    return current_value