feat: adapt interface for production
This commit is contained in:
parent
681e59d24e
commit
20f8dcd336
@ -1,7 +1,6 @@
|
||||
import sys
|
||||
from dataclasses import asdict
|
||||
from operator import truth
|
||||
from typing import Generator
|
||||
from typing import Generator, Callable
|
||||
|
||||
from funcy import flatten, lmap
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
@ -10,8 +9,8 @@ from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
from cv_analysis.figure_detection.figure_detection import detect_figures
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
from cv_analysis.utils.image_extraction import extract_images_from_pdf
|
||||
from cv_analysis.table_parsing import parse_lines, parse_tables
|
||||
from cv_analysis.utils.image_extraction import extract_images_from_pdf
|
||||
from cv_analysis.utils.structures import Rectangle
|
||||
|
||||
|
||||
@ -31,12 +30,8 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
|
||||
skip_pages_without_images=table_parsing_skip_pages_without_images,
|
||||
)
|
||||
if operation == "figure":
|
||||
return make_analysis_pipeline(
|
||||
detect_figures, figure_detection_formatter, dpi=200
|
||||
)
|
||||
if (
|
||||
operation == "table_image_inference"
|
||||
): # TODO: fix pyinfra input
|
||||
return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
|
||||
if operation == "table_image_inference": # TODO: fix pyinfra input
|
||||
return make_image_analysis_pipeline(
|
||||
infer_lines,
|
||||
)
|
||||
@ -46,8 +41,10 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
|
||||
|
||||
def make_image_analysis_pipeline(
|
||||
analysis_fn,
|
||||
) -> Generator[dict, bytes, None]:
|
||||
def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
|
||||
) -> Callable[[dict], Generator[dict, bytes, None]]:
|
||||
def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]:
|
||||
pdf_bytes = data["pdf"]
|
||||
vlp_output = data["vlp_output"]
|
||||
images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
|
||||
img_results = list(map(analysis_fn, images))
|
||||
results = map(lambda i: info[i] | img_results[i], range(len(info)))
|
||||
@ -57,9 +54,7 @@ def make_image_analysis_pipeline(
|
||||
return analyse_pipeline
|
||||
|
||||
|
||||
def make_analysis_pipeline(
|
||||
analysis_fn, formatter, dpi, skip_pages_without_images=False
|
||||
):
|
||||
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
|
||||
def analyse_pipeline(pdf: bytes, index=None):
|
||||
def parse_page(page: ImagePlus):
|
||||
image = page.asarray()
|
||||
@ -88,9 +83,7 @@ def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi)
|
||||
|
||||
def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
|
||||
def format_rect(rect: Rectangle):
|
||||
rect_plus = RectanglePlus.from_pixels(
|
||||
*rect.xyxy(), page.info, alpha=False, dpi=dpi
|
||||
)
|
||||
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
|
||||
return rect_plus.asdict(derotate=True)
|
||||
|
||||
bboxes = lmap(format_rect, rects)
|
||||
@ -100,11 +93,7 @@ def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
|
||||
|
||||
def figure_detection_formatter(rects, page, dpi):
|
||||
def format_rect(rect: Rectangle):
|
||||
rect_plus = RectanglePlus.from_pixels(
|
||||
*rect.xyxy(), page.info, alpha=False, dpi=dpi
|
||||
)
|
||||
return asdict(
|
||||
ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha)
|
||||
)
|
||||
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
|
||||
return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))
|
||||
|
||||
return lmap(format_rect, rects)
|
||||
|
||||
@ -1,16 +1,13 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Optional, Tuple
|
||||
from typing import Callable, Optional
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
import cv2
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from numpy import ndarray as Array
|
||||
from scipy.signal import argrelextrema
|
||||
from scipy.stats import norm
|
||||
import fitz
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
|
||||
|
||||
def show_multiple(arrs: Tuple[Array], title: str = ""):
|
||||
@ -76,9 +73,7 @@ def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
|
||||
# print(step_size)
|
||||
# xvals = list(map(lambda i: i * step_size, range(-wing_size, wing_size + 1)))
|
||||
# print(xvals)
|
||||
kernel = np.array(
|
||||
list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1)))
|
||||
)
|
||||
kernel = np.array(list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1))))
|
||||
# print(kernel)
|
||||
maxval, minval = np.max(kernel), np.min(kernel)
|
||||
diff = maxval - minval
|
||||
@ -98,8 +93,7 @@ def min_avg_for_interval(filtered: Array, interval: int) -> float:
|
||||
|
||||
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
|
||||
performance = [
|
||||
(interval, *min_avg_for_interval(filtered, interval))
|
||||
for interval in range(min_interval, max_interval + 1)
|
||||
(interval, *min_avg_for_interval(filtered, interval)) for interval in range(min_interval, max_interval + 1)
|
||||
]
|
||||
best = min(performance, key=lambda x: x[1])
|
||||
return best[0], best[2]
|
||||
@ -148,9 +142,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
|
||||
|
||||
|
||||
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
|
||||
h, w = img.shape
|
||||
row_vals = get_lines_either(img, horizontal=True)
|
||||
col_vals = get_lines_either(img, horizontal=False)
|
||||
h, w = map(int, img.shape)
|
||||
row_vals = map(int, get_lines_either(img, horizontal=True))
|
||||
col_vals = map(int, get_lines_either(img, horizontal=False))
|
||||
|
||||
lines = [{"x1": 0, "y1": r, "x2": w, "y2": r} for r in row_vals] + [
|
||||
{"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals
|
||||
|
||||
@ -1,14 +1,10 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Optional, Tuple
|
||||
from operator import itemgetter
|
||||
from typing import Iterable
|
||||
from typing import Tuple
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
from numpy import ndarray as Array
|
||||
from scipy.signal import argrelextrema
|
||||
from scipy.stats import norm
|
||||
import fitz
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def transform_image_coordinates_to_pdf_coordinates(
|
||||
@ -22,30 +18,29 @@ def transform_image_coordinates_to_pdf_coordinates(
|
||||
|
||||
|
||||
def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) -> tuple[list[Array], dict]:
|
||||
|
||||
with fitz.open(stream=pdf_bytes) as fh:
|
||||
|
||||
images = []
|
||||
info = []
|
||||
|
||||
for page_dict in vlp_output["data"]:
|
||||
|
||||
vlp_output = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output
|
||||
|
||||
for page_dict in vlp_output:
|
||||
page_num = int(page_dict["page_idx"])
|
||||
boxes = page_dict["image_boxes"]
|
||||
boxes = page_dict["boxes"]
|
||||
boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
|
||||
|
||||
page = fh[page_num] #pages[int(page_num)]
|
||||
page_pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
|
||||
h, w = page_pixmap.h, page_pixmap.w
|
||||
page = fh[page_num] # pages[int(page_num)]
|
||||
|
||||
for bbox in boxes:
|
||||
x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"]))
|
||||
y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"]))
|
||||
for box_obj in boxes:
|
||||
bbox = box_obj["box"]
|
||||
x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox)
|
||||
rect = fitz.Rect((x1, y1), (x2, y2))
|
||||
rect = rect * page.transformation_matrix
|
||||
pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
|
||||
shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
|
||||
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
|
||||
|
||||
|
||||
images.append(image)
|
||||
info.append({"pageNum": page_num, "bbox": bbox})
|
||||
info.append({"pageNum": page_num, "bbox": bbox, "uuid": box_obj["uuid"], "label": box_obj["label"]})
|
||||
|
||||
return images, info
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from sys import stdout
|
||||
from typing import Union
|
||||
|
||||
from kn_utils.logging import logger
|
||||
from pyinfra.examples import start_standard_queue_consumer
|
||||
@ -17,7 +18,7 @@ logger.reconfigure(sink=stdout, level=settings.logging.level)
|
||||
def make_dispatched_data_analysis(config):
|
||||
skip_pages_without_images = config.table_parsing.skip_pages_without_images
|
||||
|
||||
def inner(data: bytes, message: dict) -> list:
|
||||
def inner(data: Union[dict, bytes], message: dict) -> list:
|
||||
operation = message["operation"]
|
||||
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
|
||||
return list(analyse(data))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user