fix: import error and fitz api correction in extract_images_from_pdf; table inference test
This commit is contained in:
parent
e264c948cf
commit
abd350cc42
@ -9,7 +9,8 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
|
||||
from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
from cv_analysis.figure_detection.figure_detection import detect_figures
|
||||
from cv_analysis.table_inference import extract_images_from_pdf, infer_lines
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
from cv_analysis.utils.image_extraction import extract_images_from_pdf
|
||||
from cv_analysis.table_parsing import parse_lines, parse_tables
|
||||
from cv_analysis.utils.structures import Rectangle
|
||||
|
||||
@ -48,8 +49,8 @@ def make_image_analysis_pipeline(
|
||||
) -> Generator[dict, bytes, None]:
|
||||
def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
|
||||
images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
|
||||
img_results = map(analysis_fn, images)
|
||||
results = map(lambda i: info[i] | {"tableLines": img_results[i]}, range(len(info)))
|
||||
img_results = list(map(analysis_fn, images))
|
||||
results = map(lambda i: info[i] | img_results[i], range(len(info)))
|
||||
|
||||
yield from results
|
||||
|
||||
|
||||
@ -13,8 +13,6 @@ import fitz
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
|
||||
|
||||
|
||||
|
||||
def show_multiple(arrs: Tuple[Array], title: str = ""):
|
||||
plt.clf()
|
||||
plt.cla()
|
||||
@ -113,7 +111,7 @@ def filter_array(
|
||||
padding: Optional[Array] = None,
|
||||
pad_value_function: Callable[[Array], float] = np.mean,
|
||||
) -> Array:
|
||||
if not sum_filter:
|
||||
if sum_filter is None:
|
||||
return array
|
||||
fsize = len(sum_filter)
|
||||
assert fsize % 2
|
||||
|
||||
@ -8,6 +8,7 @@ from scipy.signal import argrelextrema
|
||||
from scipy.stats import norm
|
||||
import fitz
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def transform_image_coordinates_to_pdf_coordinates(
|
||||
@ -33,14 +34,17 @@ def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200)
|
||||
boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
|
||||
|
||||
page = fh[page_num] #pages[int(page_num)]
|
||||
h, w = page.shape
|
||||
|
||||
page_pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
|
||||
h, w = page_pixmap.h, page_pixmap.w
|
||||
|
||||
for bbox in boxes:
|
||||
x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"]))
|
||||
y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"]))
|
||||
rect = fitz.Rect((x1, y1), (x2, y2))
|
||||
pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
|
||||
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||
shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
|
||||
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
|
||||
|
||||
images.append(image)
|
||||
info.append({"pageNum": page_num, "bbox": bbox})
|
||||
|
||||
|
||||
@ -78,7 +78,7 @@ def formatter(operation):
|
||||
raise
|
||||
|
||||
|
||||
@pytest.mark.parametrize("operation", ["table_cells", "figure"])
|
||||
@pytest.mark.parametrize("operation", ["figure"])
|
||||
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
|
||||
analysis_pipeline = make_analysis_pipeline(
|
||||
analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False
|
||||
|
||||
12
test/unit_tests/table_inference_test.py
Normal file
12
test/unit_tests/table_inference_test.py
Normal file
@ -0,0 +1,12 @@
|
||||
from cv_analysis.server.pipeline import make_image_analysis_pipeline
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
|
||||
def test_table_inference():
    """Run the image analysis pipeline with ``infer_lines`` on a sample PDF.

    The pipeline is fed the raw bytes of ``test/test_data/article.pdf`` (the
    path is relative, so it is resolved against the current working
    directory) together with a mocked VLP output holding a single box
    labelled "table".  The test passes when the first yielded result has more
    than one entry under "tableLines" and each entry exposes exactly the keys
    x1, x2, y1 and y2.
    """
    pipeline = make_image_analysis_pipeline(infer_lines)

    with open("test/test_data/article.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Single mocked detection: one "table" box given as fractional coordinates.
    table_box = {"label": "table", "x1": 0.1, "y1": 0.3, "x2": 0.4, "y2": 0.6}
    vlp_mock = {"data": [{"page_idx": 1, "image_boxes": [table_box]}]}

    # The pipeline is a generator; materialise it before indexing.
    results = list(pipeline(pdf_bytes, vlp_mock))
    table_lines = results[0]["tableLines"]

    assert len(table_lines) > 1

    expected_keys = ['x1', 'x2', 'y1', 'y2']
    assert all(sorted(line.keys()) == expected_keys for line in table_lines)
|
||||
Loading…
x
Reference in New Issue
Block a user