Pull request #19: Refactor evaluate
Merge in RR/cv-analysis from refactor-evaluate to master
Squashed commit of the following:
commit cde03a492452610322f8b7d3eb804a51afb76d81
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 22 12:37:36 2022 +0200
add option to show the analysis metadata dict
commit fb8bb9e2afa7767f2560f865516295be65f97f20
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 22 12:13:18 2022 +0200
add script to evaluate runtime per page for all cv-analysis operations for multiple PDFs
commit 721e823e2ec38aae3fea51d01e2135fc8f228d94
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 22 10:30:31 2022 +0200
refactor
commit a453753cfa477e162e5902ce191ded61cb678337
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 22 10:19:24 2022 +0200
add logic to transform result coordinates according to page rotation, update annotation script to use this logic
commit 71c09758d0fb763a2c38c6871e1d9bf51f2e7c41
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jul 21 15:57:49 2022 +0200
introduce pipeline for image conversion, analysis and result formatting
commit aef252a41b9658dd0c4f55aa2d9f84de933586e0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jul 21 15:57:38 2022 +0200
introduce pipeline for image conversion, analysis and result formatting
This commit is contained in:
parent
e7b28f5bda
commit
a871fa3bd3
64
cv_analysis/server/pipeline.py
Normal file
64
cv_analysis/server/pipeline.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
from functools import partial
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
from funcy import lmap
|
||||||
|
|
||||||
|
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
|
||||||
|
from cv_analysis.layout_parsing import parse_layout
|
||||||
|
from cv_analysis.server.rotate import rotate_rectangle
|
||||||
|
from cv_analysis.table_parsing import parse_tables
|
||||||
|
from cv_analysis.utils.logging import get_logger
|
||||||
|
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
|
||||||
|
from cv_analysis.utils.structures import Rectangle
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
    """Build an end-to-end pipeline that analyses a PDF with ``analysis_fn``.

    The returned callable takes the PDF as bytes (and optionally a page
    ``index``) and returns a lazy iterator of dicts. Each dict contains the
    page metadata plus a ``"bboxes"`` list; pages whose ``"bboxes"`` list is
    empty are filtered out.

    Steps:
        1. Convert the PDF to image arrays and page metadata.
        2. Analyse each page image, getting rectangles per page (e.g. table cells).
        3. Rescale the pixel coordinates by ``72 / dpi``.
        4. Rotate the rectangles if the page metadata reports a non-zero rotation.
        5. Format the results as a stream of dictionaries.

    Args:
        analysis_fn: Callable that receives a page image and returns an
            iterable of rectangles.
        dpi: Resolution used for rendering the pages and for rescaling the
            resulting coordinates.

    Returns:
        The ``pipeline(pdf, index=None)`` function.
    """

    def page_to_result(image_metadata_pair):
        # Run the analysis first, then lazily chain the coordinate transforms.
        boxes = analysis_fn(image_metadata_pair.image)
        boxes = map(partial(pixel_rect_to_inches_rect, dpi=dpi), boxes)

        page_metadata = image_metadata_pair.metadata
        if page_metadata["rotation"] != 0:
            boxes = map(partial(rotate_rectangle, metadata=page_metadata), boxes)

        # lmap materialises the lazy chain into a list of xyxy dictionaries.
        return {**page_metadata, "bboxes": lmap(lambda box: box.json_xyxy(), boxes)}

    def has_bboxes(result):
        return result["bboxes"]

    def pipeline(pdf: bytes, index=None):
        pages = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi)
        return filter(has_bboxes, map(page_to_result, pages))

    return pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def get_analysis_fn(analysis_type):
    """Return the analysis function that belongs to ``analysis_type``.

    Args:
        analysis_type: One of ``"table"``, ``"layout"`` or ``"figure"``.

    Returns:
        ``parse_tables`` for ``"table"``, ``parse_layout`` for ``"layout"``, or
        the result of ``make_figure_detection_pipeline()`` for ``"figure"``.

    Raises:
        ValueError: If ``analysis_type`` is none of the supported values.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        # Unlike the other two, the figure analysis function is built on demand.
        return make_figure_detection_pipeline()

    # A bare ``raise`` has no active exception to re-raise here and would only
    # surface as an uninformative RuntimeError; name the offending value instead.
    raise ValueError(
        f"Unknown analysis type {analysis_type!r}; expected one of 'table', 'layout', 'figure'."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def pixel_rect_to_inches_rect(rect, dpi):
    """Rescale the corner coordinates of ``rect`` from pixels by ``72 / dpi``.

    Each of ``x1``, ``y1``, ``x2`` and ``y2`` is divided by ``dpi`` and then
    multiplied by 72; a new non-discrete ``Rectangle`` is built from the result.

    NOTE(review): dividing by ``dpi`` yields inches, so the extra factor of 72
    suggests the result is in 1/72-inch units rather than inches — confirm.
    """
    corners = (rect.x1, rect.y1, rect.x2, rect.y2)
    scaled_corners = tuple(value / dpi * 72 for value in corners)
    return Rectangle.from_xyxy(scaled_corners, discrete=False)
|
||||||
@ -1,35 +1,25 @@
|
|||||||
from _operator import itemgetter
|
from _operator import itemgetter
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from cv_analysis.utils.structures import Rectangle
|
from cv_analysis.utils.structures import Rectangle
|
||||||
|
|
||||||
|
|
||||||
def make_formatter(dpi, page_size, rotation):
|
def rotate_rectangle(rectangle, metadata):
|
||||||
|
width, height, rotation = itemgetter("width", "height", "rotation")(metadata)
|
||||||
rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation
|
rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation
|
||||||
|
|
||||||
def format_(key2pixel):
|
if rotation in [1, 3]:
|
||||||
convert = partial(convert_pixel_to_inch, dpi=dpi)
|
width, height = height, width
|
||||||
x, y, w, h = map(convert, itemgetter("x", "y", "width", "height")(key2pixel))
|
|
||||||
x1, y1 = x + w, y + h
|
|
||||||
matrix = np.vstack([[x, y], [x1, y1]]).T
|
|
||||||
new_matrix = rotate_and_shift(matrix, rotation, page_size)
|
|
||||||
x1, x2 = sorted(new_matrix[0, :])
|
|
||||||
y1, y2 = sorted(new_matrix[1, :])
|
|
||||||
return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False).json_xywh()
|
|
||||||
|
|
||||||
return format_
|
x1, y1, x2, y2 = rectangle.xyxy()
|
||||||
|
matrix = np.vstack([[x1, y1], [x2, y2]]).T
|
||||||
|
new_matrix = rotate_and_shift(matrix, rotation, (width, height))
|
||||||
|
|
||||||
|
x1, x2 = sorted(new_matrix[0, :])
|
||||||
|
y1, y2 = sorted(new_matrix[1, :])
|
||||||
|
|
||||||
def convert_pixel_to_inch(pixel, dpi):
|
return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False)
|
||||||
return pixel / dpi * 72
|
|
||||||
|
|
||||||
|
|
||||||
def rotate(input_matrix, radians):
|
|
||||||
rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]])
|
|
||||||
|
|
||||||
return np.dot(rotation_matrix, input_matrix)
|
|
||||||
|
|
||||||
|
|
||||||
def rotate_and_shift(matrix, rotation, size, debug=False):
|
def rotate_and_shift(matrix, rotation, size, debug=False):
|
||||||
@ -109,3 +99,9 @@ def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_sh
|
|||||||
axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue")
|
axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue")
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
def rotate(input_matrix, radians):
    """Multiply ``input_matrix`` from the left with a 2x2 rotation matrix.

    The rotation matrix is ``[[cos, -sin], [sin, cos]]`` evaluated at
    ``radians``; the result is ``rotation_matrix . input_matrix``.
    """
    cosine, sine = np.cos(radians), np.sin(radians)
    rotation_matrix = np.vstack([[cosine, -sine], [sine, cosine]])
    return np.dot(rotation_matrix, input_matrix)
|
||||||
@ -1,43 +0,0 @@
|
|||||||
import gzip
|
|
||||||
from operator import itemgetter
|
|
||||||
from typing import Callable
|
|
||||||
|
|
||||||
from funcy import lmap
|
|
||||||
from pyinfra.server.utils import make_streamable_and_wrap_in_packing_logic
|
|
||||||
|
|
||||||
from cv_analysis.server.format import make_formatter
|
|
||||||
from cv_analysis.utils.logging import get_logger
|
|
||||||
from cv_analysis.utils.open_pdf import open_pdf
|
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
def make_streamable_analysis_fn(analysis_fn: Callable):
    """Wrap a cv-analysis function so the pyinfra server logic can stream it.

    The wrapped function works on ``data`` and ``metadata`` and returns a tuple
    of data and metadata. For details about the server logic, see the PyInfra
    documentation.

    Args:
        analysis_fn: cv-analysis function

    Returns:
        wrapped function
    """

    def analyse(data: bytes, metadata: dict):
        # The payload is gzip-compressed; only the first opened page is analysed.
        image = open_pdf(gzip.decompress(data))[0]

        dpi = metadata["image_info"]["dpi"]
        page_info = metadata["page_info"]
        width, height, rotation = itemgetter("width", "height", "rotation")(page_info)
        formatter = make_formatter(dpi, (width, height), rotation)

        cells = [formatter(rectangle.json_xywh()) for rectangle in analysis_fn(image)]

        logger.debug(f"Page {page_info.get('index', '')}: Found {len(cells)} cells.")

        # No binary payload is returned; the results travel in the metadata.
        return b"", {**metadata, "cells": cells}

    return make_streamable_and_wrap_in_packing_logic(analyse, batched=False)
|
|
||||||
@ -1,3 +1,4 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Iterator, Tuple
|
from typing import Iterator, Tuple
|
||||||
|
|
||||||
@ -5,20 +6,25 @@ import fitz
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]:
|
@dataclass
|
||||||
"""Stream the pages of a PDF as Tuples of page as matrix representation and page metadata.
|
class ImageMetadataPair:
|
||||||
Note: If Index is not given or evaluates to None, the whole PDF will be processed.
|
image: np.ndarray
|
||||||
"""
|
metadata: dict
|
||||||
convert_fn = partial(page_to_array_and_metadata, dpi=dpi)
|
|
||||||
|
|
||||||
|
def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
|
||||||
|
"""Streams PDF as pairs of image (matrix) and metadata.
|
||||||
|
Note: If Index is not given or evaluates to None, the whole PDF will be processed."""
|
||||||
|
convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
|
||||||
yield from map(convert_fn, stream_pages(pdf, index))
|
yield from map(convert_fn, stream_pages(pdf, index))
|
||||||
|
|
||||||
|
|
||||||
def page_to_array_and_metadata(page: fitz.Page, dpi):
|
def page_to_image_metadata_pair(page: fitz.Page, dpi):
|
||||||
metadata = get_page_info(page, dpi)
|
metadata = get_page_info(page)
|
||||||
pixmap = page.get_pixmap(dpi=dpi)
|
pixmap = page.get_pixmap(dpi=dpi)
|
||||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||||
|
|
||||||
return array, metadata
|
return ImageMetadataPair(array, metadata)
|
||||||
|
|
||||||
|
|
||||||
def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
|
def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
|
||||||
@ -30,11 +36,10 @@ def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
|
|||||||
yield pdf_handle[i]
|
yield pdf_handle[i]
|
||||||
|
|
||||||
|
|
||||||
def get_page_info(page, dpi):
|
def get_page_info(page):
|
||||||
return {
|
return {
|
||||||
"index": page.number,
|
"index": page.number,
|
||||||
"rotation": page.rotation,
|
"rotation": page.rotation,
|
||||||
"width": page.rect.width, # rotated page width in inches
|
"width": page.rect.width, # rotated page width in inches
|
||||||
"height": page.rect.height, # rotated page height in inches
|
"height": page.rect.height, # rotated page height in inches
|
||||||
"dpi": dpi,
|
|
||||||
}
|
}
|
||||||
@ -1,15 +1,11 @@
|
|||||||
import argparse
|
import argparse
|
||||||
from itertools import starmap
|
import json
|
||||||
|
from operator import itemgetter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from PIL import Image
|
import fitz
|
||||||
from funcy import lmap
|
|
||||||
|
|
||||||
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
|
from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline
|
||||||
from cv_analysis.layout_parsing import parse_layout
|
|
||||||
from cv_analysis.table_parsing import parse_tables
|
|
||||||
from cv_analysis.utils.draw import draw_rectangles
|
|
||||||
from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
@ -17,36 +13,35 @@ def parse_args():
|
|||||||
parser.add_argument("pdf_path")
|
parser.add_argument("pdf_path")
|
||||||
parser.add_argument("output_folder")
|
parser.add_argument("output_folder")
|
||||||
parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True)
|
parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True)
|
||||||
|
parser.add_argument("--verbose", action="store_true")
|
||||||
|
parser.add_argument("--silent", dest="verbose", action="store_false")
|
||||||
|
parser.set_defaults(verbose=False)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def analyse_and_annotate(images, analysis_fn):
|
def analyse_annotate_save(pdf, analysis_type, output_path, verbose):
|
||||||
result = map(analysis_fn, images)
|
pipe = make_analysis_pipeline(get_analysis_fn(analysis_type))
|
||||||
annotated_images = starmap(draw_rectangles, zip(images, result))
|
results = list(pipe(pdf))
|
||||||
return annotated_images
|
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
print(json.dumps(results, indent=2))
|
||||||
|
|
||||||
def save_as_pdf(images, output_folder, file_name, operation):
|
with fitz.open(stream=pdf) as pdf_handle:
|
||||||
Path(output_folder).mkdir(parents=True, exist_ok=True)
|
for result in results:
|
||||||
images = lmap(Image.fromarray, images)
|
page = pdf_handle[result["index"]]
|
||||||
images[0].save(f"{output_folder}/{file_name}_annotated_{operation}.pdf", save_all=True, append_images=images)
|
for rect in result["bboxes"]:
|
||||||
|
x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(rect)
|
||||||
|
page.draw_rect((x1, y1, x2, y2), color=(0.5, 0.7, 0.2), width=2)
|
||||||
def get_analysis_fn(analysis_type):
|
pdf_handle.save(output_path)
|
||||||
if analysis_type == "table":
|
|
||||||
return parse_tables
|
|
||||||
elif analysis_type == "layout":
|
|
||||||
return parse_layout
|
|
||||||
elif analysis_type == "figure":
|
|
||||||
return make_figure_detection_pipeline()
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
with open(args.pdf_path, "rb") as f:
|
with open(args.pdf_path, "rb") as f:
|
||||||
pdf_bytes = f.read()
|
pdf_bytes = f.read()
|
||||||
images, _ = zip(*pdf_to_array_and_metadata(pdf_bytes))
|
|
||||||
annotated_pages = analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type))
|
Path(args.output_folder).mkdir(parents=True, exist_ok=True)
|
||||||
save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type)
|
output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf"
|
||||||
|
|
||||||
|
analyse_annotate_save(pdf_bytes, args.type, output_path, args.verbose)
|
||||||
|
|||||||
96
scripts/measure_runtimes.py
Normal file
96
scripts/measure_runtimes.py
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
from functools import partial
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import fitz
|
||||||
|
import numpy as np
|
||||||
|
from funcy import lmap
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
|
||||||
|
from cv_analysis.server.pipeline import make_analysis_pipeline, get_analysis_fn
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line arguments of the runtime measurement script.

    Returns:
        argparse.Namespace with the attributes ``pdf_folder``,
        ``output_folder`` and ``n_runs`` (an int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_folder", help="Path to folder with PDFs to evaluate")
    parser.add_argument("output_folder", help="Path to folder where the Runtime plot should be stored")
    # Let argparse convert and validate the run count so that a non-numeric
    # value is rejected with a usage message instead of failing later in int().
    parser.add_argument("n_runs", type=int, help="Number of runs per test")
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def measure(fn, n_runs):
    """Wrap ``fn`` so that calling it reports runtime statistics instead of results.

    Args:
        fn: Callable returning an iterable. The iterable is fully consumed on
            every run so that lazy generators are actually evaluated.
        n_runs: Number of times ``fn`` is executed per call of the wrapper.

    Returns:
        A function that accepts the arguments of ``fn`` and returns the tuple
        ``(mean, std)`` of the measured runtimes in seconds.
    """

    def run(*args, **kwargs):
        def _run():
            # perf_counter is monotonic and uses the highest-resolution clock,
            # whereas time.time() can jump when the system clock is adjusted.
            start = time.perf_counter()
            list(fn(*args, **kwargs))  # Evaluate generators; the values themselves are not needed
            return time.perf_counter() - start

        runtimes = [_run() for _ in range(n_runs)]
        return np.mean(runtimes), np.std(runtimes)

    return run
|
||||||
|
|
||||||
|
|
||||||
|
def run_tests(pdf, test_cases, n_runs):
    """Time the analysis pipeline of every test case on the given PDF.

    Args:
        pdf: PDF passed to each analysis pipeline.
        test_cases: Analysis type names understood by ``get_analysis_fn``.
        n_runs: Number of timed runs per test case.

    Returns:
        List with one ``measure`` result per test case, in input order.
    """
    timings = []
    for test_case in test_cases:
        analysis_pipe = make_analysis_pipeline(get_analysis_fn(test_case))
        timings.append(measure(analysis_pipe, n_runs)(pdf))
    return timings
|
||||||
|
|
||||||
|
|
||||||
|
def to_ms_per_page(runtime, page_count):
    """Divide ``runtime`` by ``page_count``, scale by 1000 and round to zero decimals."""
    return round(runtime / page_count * 1000, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def measure_pdf(pdf_path, n_runs):
    """Measure the per-page runtime of all test cases for a single PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        n_runs: Number of timed runs per test case.

    Returns:
        Tuple ``(means, std)`` of two lists holding, per test case, the mean
        and standard deviation of the runtime converted by ``to_ms_per_page``.
    """
    with open(pdf_path, "rb") as f:
        pdf = f.read()

    # Only the page count is needed; close the document handle right away
    # instead of leaving it open for the lifetime of the process.
    with fitz.open(stream=pdf) as pdf_handle:
        page_count = pdf_handle.page_count

    format_fn = partial(to_ms_per_page, page_count=page_count)

    # NOTE: ``test_cases`` is not a parameter; it is read from module scope.
    means, std = zip(*run_tests(pdf, test_cases, n_runs=n_runs))
    means, std = lmap(format_fn, means), lmap(format_fn, std)
    return means, std
|
||||||
|
|
||||||
|
|
||||||
|
def plot_results_and_save(results, labels, n_runs, test_pdf_paths):
    """Draw a grouped bar chart of the measured runtimes and save it as PNG.

    Args:
        results: One ``(means, std)`` pair per PDF.
        labels: Tick labels of the x axis, one per cv-analysis operation.
        n_runs: Number of runs, shown in the title and the output file name.
        test_pdf_paths: Paths of the measured PDFs; their stems label the bars.
    """
    bar_width = 0.2
    _, ax = plt.subplots()
    tick_positions = np.arange(len(labels))
    plt.xticks(ticks=tick_positions, labels=labels, rotation=90)
    plt.grid(linestyle="dotted")

    for position, ((means, std), test_pdf_path) in enumerate(zip(results, test_pdf_paths)):
        # Shift each PDF's bars by one bar width so they sit side by side.
        bars = ax.bar(
            tick_positions + position * bar_width,
            means,
            bar_width,
            yerr=std,
            label=f"{test_pdf_path.stem}",
        )
        ax.bar_label(bars)

    ax.set_ylabel("ms/page")
    ax.set_xlabel("Cv-analysis operation")
    ax.set_title(f"Cv-analysis runtime estimation {n_runs=}")
    ax.legend(loc=0)

    # NOTE(review): ``args`` is not a parameter; it is read from module scope.
    target_folder = args.output_folder
    Path(target_folder).mkdir(parents=True, exist_ok=True)
    plt.savefig(
        f"{target_folder}/cv_analysis_runtime_{n_runs=}.png",
        dpi=200,
        bbox_inches="tight",
        pad_inches=0.5,
    )
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def measure_and_save_plot(args, test_cases):
    """Measure every ``*.pdf`` file in ``args.pdf_folder`` and plot the results.

    Args:
        args: Parsed arguments providing ``n_runs`` and ``pdf_folder``.
        test_cases: Labels of the analysis operations, passed on to the plot.
    """
    n_runs = int(args.n_runs)
    test_pdf_paths = list(Path(args.pdf_folder).glob("*.pdf"))
    results = [measure_pdf(pdf_path, n_runs=n_runs) for pdf_path in test_pdf_paths]
    plot_results_and_save(results, test_cases, n_runs, test_pdf_paths)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Both names must stay bound at module level: measure_pdf reads
    # ``test_cases`` and plot_results_and_save reads ``args`` as globals.
    args = parse_args()
    test_cases = ["table", "layout", "figure"]

    measure_and_save_plot(args, test_cases)
|
||||||
@ -2,7 +2,7 @@ import fitz
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata
|
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -17,8 +17,8 @@ def pdf(n_pages):
|
|||||||
|
|
||||||
@pytest.mark.parametrize("n_pages", [1])
|
@pytest.mark.parametrize("n_pages", [1])
|
||||||
def test_pdf_to_array_and_metadata(pdf):
|
def test_pdf_to_array_and_metadata(pdf):
|
||||||
for array, metadata in pdf_to_array_and_metadata(pdf):
|
for image_metadata_pair in pdf_to_image_metadata_pairs(pdf):
|
||||||
assert isinstance(array, np.ndarray)
|
assert isinstance(image_metadata_pair.image, np.ndarray)
|
||||||
assert array.shape == (2339, 1653, 3) # Height, Width, Color channels
|
assert image_metadata_pair.image.shape == (2339, 1653, 3) # Height, Width, Color channels
|
||||||
|
|
||||||
assert isinstance(metadata, dict)
|
assert isinstance(image_metadata_pair.metadata, dict)
|
||||||
Loading…
x
Reference in New Issue
Block a user