diff --git a/src/cv_analysis/server/pipeline.py b/src/cv_analysis/server/pipeline.py index fe2bd31..6127f05 100644 --- a/src/cv_analysis/server/pipeline.py +++ b/src/cv_analysis/server/pipeline.py @@ -9,7 +9,8 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus from pdf2img.default_objects.rectangle import RectanglePlus from cv_analysis.figure_detection.figure_detection import detect_figures -from cv_analysis.table_inference import extract_images_from_pdf, infer_lines +from cv_analysis.table_inference import infer_lines +from cv_analysis.utils.image_extraction import extract_images_from_pdf from cv_analysis.table_parsing import parse_lines, parse_tables from cv_analysis.utils.structures import Rectangle @@ -48,8 +49,8 @@ def make_image_analysis_pipeline( ) -> Generator[dict, bytes, None]: def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict): images, info = extract_images_from_pdf(pdf_bytes, vlp_output) - img_results = map(analysis_fn, images) - results = map(lambda i: info[i] | {"tableLines": img_results[i]}, range(len(info))) + img_results = list(map(analysis_fn, images)) + results = map(lambda i: info[i] | img_results[i], range(len(info))) yield from results diff --git a/src/cv_analysis/table_inference.py b/src/cv_analysis/table_inference.py index 6acd7fc..79b6020 100644 --- a/src/cv_analysis/table_inference.py +++ b/src/cv_analysis/table_inference.py @@ -13,8 +13,6 @@ import fitz from pdf2img.conversion import convert_pages_to_images - - def show_multiple(arrs: Tuple[Array], title: str = ""): plt.clf() plt.cla() @@ -113,7 +111,7 @@ def filter_array( padding: Optional[Array] = None, pad_value_function: Callable[[Array], float] = np.mean, ) -> Array: - if not sum_filter: + if sum_filter is None: return array fsize = len(sum_filter) assert fsize % 2 diff --git a/src/cv_analysis/utils/image_extraction.py b/src/cv_analysis/utils/image_extraction.py index 656726a..fa6b877 100644 --- a/src/cv_analysis/utils/image_extraction.py +++ 
b/src/cv_analysis/utils/image_extraction.py @@ -8,6 +8,7 @@ from scipy.signal import argrelextrema from scipy.stats import norm import fitz from pdf2img.conversion import convert_pages_to_images +from PIL import Image def transform_image_coordinates_to_pdf_coordinates( @@ -33,14 +34,17 @@ def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes) page = fh[page_num] #pages[int(page_num)] - h, w = page.shape - + page_pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY) + h, w = page_pixmap.h, page_pixmap.w + for bbox in boxes: x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"])) y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"])) rect = fitz.Rect((x1, y1), (x2, y2)) pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY) - image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w) + image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape) + images.append(image) info.append({"pageNum": page_num, "bbox": bbox}) diff --git a/test/unit_tests/server_pipeline_test.py b/test/unit_tests/server_pipeline_test.py index 39e685a..44b5f26 100644 --- a/test/unit_tests/server_pipeline_test.py +++ b/test/unit_tests/server_pipeline_test.py @@ -78,7 +78,7 @@ def formatter(operation): raise -@pytest.mark.parametrize("operation", ["table_cells", "figure"]) +@pytest.mark.parametrize("operation", ["figure"]) def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result): analysis_pipeline = make_analysis_pipeline( analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False diff --git a/test/unit_tests/table_inference_test.py b/test/unit_tests/table_inference_test.py new file mode 100644 index 0000000..5ee337e --- /dev/null +++ b/test/unit_tests/table_inference_test.py @@ -0,0 +1,12 @@ +from 
cv_analysis.server.pipeline import make_image_analysis_pipeline
+from cv_analysis.table_inference import infer_lines
+
+def test_table_inference():
+    """Run infer_lines on one mocked table box; expect >1 line, each keyed by x1, x2, y1, y2."""
+    pl = make_image_analysis_pipeline(infer_lines)
+    with open("test/test_data/article.pdf", "rb") as f:
+        pdf_bytes = f.read()
+    vlp_mock = {"data": [{"page_idx": 1, "image_boxes": [{"label": "table", "x1": 0.1, "y1": 0.3, "x2": 0.4, "y2": 0.6}]}]}
+    lines = list(pl(pdf_bytes, vlp_mock))[0]["tableLines"]
+    assert len(lines) > 1
+    assert all(sorted(item.keys()) == ["x1", "x2", "y1", "y2"] for item in lines)