Merge in RR/cv-analysis from add-pdf2array-func to master
Squashed commit of the following:
commit 6e6e9a509ede0abf28fb93a2042960efcc9453bd
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 20 09:12:01 2022 +0200
update script with layout parsing, refactor pdf2array
commit 191bc71f58aa5c07b0cadbdb7067cd72c3d8858b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 20 09:10:06 2022 +0200
update script with layout parsing, refactor pdf2array
commit 25201bbb4151a23784193181272d379232877d2f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 20 08:33:20 2022 +0200
add pdf2array functionality
25 lines
679 B
Python
25 lines
679 B
Python
import fitz
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata
|
|
|
|
|
|
@pytest.fixture
def pdf(n_pages):
    """Build a small in-memory PDF and return it in serialized form.

    Each page carries the same single line of Latin text so that the
    rendered output is not blank.

    Args:
        n_pages: Number of pages to create. Expected to be supplied by the
            requesting test (e.g. through ``pytest.mark.parametrize``).

    Returns:
        The value of ``doc.write()`` -- the serialized PDF document
        (bytes per the PyMuPDF documentation).
    """
    # Calling fitz.open() without arguments creates a new, empty document.
    doc = fitz.open()
    try:
        for _ in range(n_pages):
            page = doc.new_page()
            where = fitz.Point(50, 100)
            page.insert_text(where, "De gustibus non est disputandum.", fontsize=30)
        return doc.write()
    finally:
        # Release the underlying document handle even if page creation or
        # serialization raises; the serialized return value is independent
        # of the document object.
        doc.close()
|
|
|
|
|
|
@pytest.mark.parametrize("n_pages", [1])
def test_pdf_to_array_and_metadata(pdf):
    """Check the type and shape of what ``pdf_to_array_and_metadata`` yields.

    Every yielded item must unpack into ``(array, metadata)`` where ``array``
    is a NumPy array of the expected shape and ``metadata`` is a dict.
    """
    results = list(pdf_to_array_and_metadata(pdf))

    # Without this guard the loop below would run zero times and the test
    # would pass vacuously if the function yielded nothing.
    assert results, "pdf_to_array_and_metadata yielded no pages"

    for array, metadata in results:
        assert isinstance(array, np.ndarray)
        # NOTE(review): 2339 x 1653 presumably corresponds to the default
        # (A4-sized) page rendered at ~200 DPI -- confirm against pdf2array.
        assert array.shape == (2339, 1653, 3)  # Height, Width, Color channels

        assert isinstance(metadata, dict)
|