diff --git a/src/cv_analysis/server/pipeline.py b/src/cv_analysis/server/pipeline.py index 6127f05..1642e8b 100644 --- a/src/cv_analysis/server/pipeline.py +++ b/src/cv_analysis/server/pipeline.py @@ -1,7 +1,6 @@ -import sys from dataclasses import asdict from operator import truth -from typing import Generator +from typing import Generator, Callable from funcy import flatten, lmap from pdf2img.conversion import convert_pages_to_images @@ -10,8 +9,8 @@ from pdf2img.default_objects.rectangle import RectanglePlus from cv_analysis.figure_detection.figure_detection import detect_figures from cv_analysis.table_inference import infer_lines -from cv_analysis.utils.image_extraction import extract_images_from_pdf from cv_analysis.table_parsing import parse_lines, parse_tables +from cv_analysis.utils.image_extraction import extract_images_from_pdf from cv_analysis.utils.structures import Rectangle @@ -31,12 +30,8 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru skip_pages_without_images=table_parsing_skip_pages_without_images, ) if operation == "figure": - return make_analysis_pipeline( - detect_figures, figure_detection_formatter, dpi=200 - ) - if ( - operation == "table_image_inference" - ): # TODO: fix pyinfra input + return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200) + if operation == "table_image_inference": # TODO: fix pyinfra input return make_image_analysis_pipeline( infer_lines, ) @@ -46,8 +41,10 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru def make_image_analysis_pipeline( analysis_fn, -) -> Generator[dict, bytes, None]: - def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict): +) -> Callable[[dict], Generator[dict, bytes, None]]: + def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]: + pdf_bytes = data["pdf"] + vlp_output = data["vlp_output"] images, info = extract_images_from_pdf(pdf_bytes, vlp_output) img_results = list(map(analysis_fn, images)) results = map(lambda i: info[i] | img_results[i], range(len(info))) @@ -57,9 +54,7 @@ def make_image_analysis_pipeline( return analyse_pipeline -def make_analysis_pipeline( - analysis_fn, formatter, dpi, skip_pages_without_images=False -): +def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False): def analyse_pipeline(pdf: bytes, index=None): def parse_page(page: ImagePlus): image = page.asarray() @@ -88,9 +83,7 @@ def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi) def table_parsing_cells_formatter(rects, page: ImagePlus, dpi): def format_rect(rect: Rectangle): - rect_plus = RectanglePlus.from_pixels( - *rect.xyxy(), page.info, alpha=False, dpi=dpi - ) + rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi) return rect_plus.asdict(derotate=True) bboxes = lmap(format_rect, rects) @@ -100,11 +93,7 @@ def table_parsing_cells_formatter(rects, page: ImagePlus, dpi): def figure_detection_formatter(rects, page, dpi): def format_rect(rect: Rectangle): - rect_plus = RectanglePlus.from_pixels( - *rect.xyxy(), page.info, alpha=False, dpi=dpi - ) - return asdict( - ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha) - ) + rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi) + return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha)) return lmap(format_rect, rects) diff --git a/src/cv_analysis/table_inference.py b/src/cv_analysis/table_inference.py index 79b6020..3a1082a 100644 --- a/src/cv_analysis/table_inference.py +++ b/src/cv_analysis/table_inference.py @@ -1,16 +1,13 @@ from pathlib import Path -from typing import Callable, Iterable, Optional, Tuple +from typing import Callable, Optional from typing import Tuple - import cv2 import matplotlib.pyplot as plt import numpy as np from numpy import ndarray as Array from scipy.signal import argrelextrema from scipy.stats import norm -import fitz -from pdf2img.conversion import convert_pages_to_images def show_multiple(arrs: Tuple[Array], title: str = ""): @@ -76,9 +73,7 @@ def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array: # print(step_size) # xvals = list(map(lambda i: i * step_size, range(-wing_size, wing_size + 1))) # print(xvals) - kernel = np.array( - list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1))) - ) + kernel = np.array(list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1)))) # print(kernel) maxval, minval = np.max(kernel), np.min(kernel) diff = maxval - minval @@ -98,8 +93,7 @@ def min_avg_for_interval(filtered: Array, interval: int) -> float: def search_intervals(filtered: Array, min_interval: int, max_interval: int): performance = [ - (interval, *min_avg_for_interval(filtered, interval)) - for interval in range(min_interval, max_interval + 1) + (interval, *min_avg_for_interval(filtered, interval)) for interval in range(min_interval, max_interval + 1) ] best = min(performance, key=lambda x: x[1]) return best[0], best[2] @@ -148,9 +142,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array: def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]: - h, w = img.shape - row_vals = get_lines_either(img, horizontal=True) - col_vals = get_lines_either(img, horizontal=False) + h, w = map(int, img.shape) + row_vals = map(int, get_lines_either(img, horizontal=True)) + col_vals = map(int, get_lines_either(img, horizontal=False)) lines = [{"x1": 0, "y1": r, "x2": w, "y2": r} for r in row_vals] + [ {"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals diff --git a/src/cv_analysis/utils/image_extraction.py b/src/cv_analysis/utils/image_extraction.py index fa6b877..d5fefc4 100644 --- a/src/cv_analysis/utils/image_extraction.py +++ b/src/cv_analysis/utils/image_extraction.py @@ -1,14 +1,10 @@ -from pathlib import Path -from typing import Callable, Iterable, Optional, Tuple +from operator import itemgetter +from typing import Iterable from typing import Tuple +import fitz import numpy as np from numpy import ndarray as Array -from scipy.signal import argrelextrema -from scipy.stats import norm -import fitz -from pdf2img.conversion import convert_pages_to_images -from PIL import Image def transform_image_coordinates_to_pdf_coordinates( @@ -22,30 +18,29 @@ def transform_image_coordinates_to_pdf_coordinates( def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) -> tuple[list[Array], dict]: - with fitz.open(stream=pdf_bytes) as fh: - images = [] info = [] - - for page_dict in vlp_output["data"]: + + vlp_output = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output + + for page_dict in vlp_output: page_num = int(page_dict["page_idx"]) - boxes = page_dict["image_boxes"] + boxes = page_dict["boxes"] boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes) - page = fh[page_num] #pages[int(page_num)] - page_pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY) - h, w = page_pixmap.h, page_pixmap.w + page = fh[page_num] # pages[int(page_num)] - for bbox in boxes: - x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"])) - y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"])) + for box_obj in boxes: + bbox = box_obj["box"] + x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox) rect = fitz.Rect((x1, y1), (x2, y2)) + rect = rect * page.transformation_matrix pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY) shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w) image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape) - + images.append(image) - info.append({"pageNum": page_num, "bbox": bbox}) + info.append({"pageNum": page_num, "bbox": bbox, "uuid": box_obj["uuid"], "label": box_obj["label"]}) return images, info diff --git a/src/serve.py b/src/serve.py index f3331f2..78cb024 100644 --- a/src/serve.py +++ b/src/serve.py @@ -1,4 +1,5 @@ from sys import stdout +from typing import Union from kn_utils.logging import logger from pyinfra.examples import start_standard_queue_consumer @@ -17,7 +18,7 @@ logger.reconfigure(sink=stdout, level=settings.logging.level) def make_dispatched_data_analysis(config): skip_pages_without_images = config.table_parsing.skip_pages_without_images - def inner(data: bytes, message: dict) -> list: + def inner(data: Union[dict, bytes], message: dict) -> list: operation = message["operation"] analyse = get_analysis_pipeline(operation, skip_pages_without_images) return list(analyse(data))