feat: adapt interface for production

This commit is contained in:
Julius Unverfehrt 2024-04-22 15:42:34 +02:00 committed by iriley
parent 681e59d24e
commit 20f8dcd336
4 changed files with 35 additions and 56 deletions

View File

@ -1,7 +1,6 @@
import sys
from dataclasses import asdict
from operator import truth
from typing import Generator
from typing import Generator, Callable
from funcy import flatten, lmap
from pdf2img.conversion import convert_pages_to_images
@ -10,8 +9,8 @@ from pdf2img.default_objects.rectangle import RectanglePlus
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_inference import infer_lines
from cv_analysis.utils.image_extraction import extract_images_from_pdf
from cv_analysis.table_parsing import parse_lines, parse_tables
from cv_analysis.utils.image_extraction import extract_images_from_pdf
from cv_analysis.utils.structures import Rectangle
@ -31,12 +30,8 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
skip_pages_without_images=table_parsing_skip_pages_without_images,
)
if operation == "figure":
return make_analysis_pipeline(
detect_figures, figure_detection_formatter, dpi=200
)
if (
operation == "table_image_inference"
): # TODO: fix pyinfra input
return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
if operation == "table_image_inference": # TODO: fix pyinfra input
return make_image_analysis_pipeline(
infer_lines,
)
@ -46,8 +41,10 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
def make_image_analysis_pipeline(
analysis_fn,
) -> Generator[dict, bytes, None]:
def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
) -> Callable[[dict], Generator[dict, bytes, None]]:
def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]:
pdf_bytes = data["pdf"]
vlp_output = data["vlp_output"]
images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
img_results = list(map(analysis_fn, images))
results = map(lambda i: info[i] | img_results[i], range(len(info)))
@ -57,9 +54,7 @@ def make_image_analysis_pipeline(
return analyse_pipeline
def make_analysis_pipeline(
analysis_fn, formatter, dpi, skip_pages_without_images=False
):
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
def analyse_pipeline(pdf: bytes, index=None):
def parse_page(page: ImagePlus):
image = page.asarray()
@ -88,9 +83,7 @@ def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi)
def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(
*rect.xyxy(), page.info, alpha=False, dpi=dpi
)
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
return rect_plus.asdict(derotate=True)
bboxes = lmap(format_rect, rects)
@ -100,11 +93,7 @@ def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
def figure_detection_formatter(rects, page, dpi):
    """Format detected figure rectangles of one page as plain dictionaries.

    Args:
        rects: Iterable of ``Rectangle`` objects; each must provide ``xyxy()``.
            Presumably pixel coordinates on the rendered page image -- confirm
            against the detector that produces them.
        page: Page object exposing an ``info`` attribute that is forwarded to
            ``RectanglePlus.from_pixels`` and ``ImageInfo``.
        dpi: Resolution passed to ``RectanglePlus.from_pixels``.

    Returns:
        A list with one ``asdict(ImageInfo(...))`` dictionary per rectangle,
        in input order.
    """

    def format_rect(rect: Rectangle):
        # Build the enriched rectangle from the four corner values of `rect`.
        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        # The bounding box is requested with derotate=False here.
        return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))

    return lmap(format_rect, rects)

View File

@ -1,16 +1,13 @@
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from typing import Callable, Optional
from typing import Tuple
import cv2
import matplotlib.pyplot as plt
import numpy as np
from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
def show_multiple(arrs: Tuple[Array], title: str = ""):
@ -76,9 +73,7 @@ def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
# print(step_size)
# xvals = list(map(lambda i: i * step_size, range(-wing_size, wing_size + 1)))
# print(xvals)
kernel = np.array(
list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1)))
)
kernel = np.array(list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1))))
# print(kernel)
maxval, minval = np.max(kernel), np.min(kernel)
diff = maxval - minval
@ -98,8 +93,7 @@ def min_avg_for_interval(filtered: Array, interval: int) -> float:
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Pick the best-scoring interval in ``[min_interval, max_interval]``.

    Every candidate interval is evaluated with ``min_avg_for_interval``. That
    helper's result is unpacked, so it must yield at least two values per
    call; the first is used as the score to minimise, and the second is
    returned alongside the winning interval.

    Args:
        filtered: Array handed unchanged to ``min_avg_for_interval``.
        min_interval: Smallest interval to try (inclusive).
        max_interval: Largest interval to try (inclusive).

    Returns:
        A tuple ``(interval, second_value)`` for the candidate with the
        smallest score.

    Raises:
        ValueError: If ``min_interval > max_interval``, because ``min`` is
            then called on an empty list.
    """
    performance = [
        (interval, *min_avg_for_interval(filtered, interval))
        for interval in range(min_interval, max_interval + 1)
    ]
    # Index 1 is the first value returned by min_avg_for_interval (the score).
    best = min(performance, key=lambda x: x[1])
    return best[0], best[2]
@ -148,9 +142,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
h, w = img.shape
row_vals = get_lines_either(img, horizontal=True)
col_vals = get_lines_either(img, horizontal=False)
h, w = map(int, img.shape)
row_vals = map(int, get_lines_either(img, horizontal=True))
col_vals = map(int, get_lines_either(img, horizontal=False))
lines = [{"x1": 0, "y1": r, "x2": w, "y2": r} for r in row_vals] + [
{"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals

View File

@ -1,14 +1,10 @@
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from operator import itemgetter
from typing import Iterable
from typing import Tuple
import fitz
import numpy as np
from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
from PIL import Image
def transform_image_coordinates_to_pdf_coordinates(
@ -22,30 +18,29 @@ def transform_image_coordinates_to_pdf_coordinates(
def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) -> tuple[list[Array], list[dict]]:
    """Render every box labelled ``"table"`` in ``vlp_output`` as a grayscale image.

    Args:
        pdf_bytes: Raw PDF document, opened in memory with ``fitz.open``.
        vlp_output: Either a dict whose ``"data"`` entry holds the per-page
            records, or the sequence of per-page records itself. Each record
            is read for ``"page_idx"`` and ``"boxes"``; each box for
            ``"label"``, ``"uuid"`` and ``"box"`` (with keys ``x1``, ``y1``,
            ``x2``, ``y2``).
        dpi: Resolution used when rendering each clip.

    Returns:
        ``(images, info)``: two lists of equal length. ``images[i]`` is a
        ``uint8`` array of the rendered clip, and ``info[i]`` is a dict with
        the keys ``pageNum``, ``bbox``, ``uuid`` and ``label`` describing it.

    Raises:
        KeyError: If a page record or box lacks one of the keys listed above.
    """
    images = []
    info = []

    # Accept both the wrapped ({"data": [...]}) and the bare list form without
    # rebinding the caller-visible parameter name.
    page_records = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output

    with fitz.open(stream=pdf_bytes) as fh:
        for page_dict in page_records:
            page_num = int(page_dict["page_idx"])
            table_boxes = (box_obj for box_obj in page_dict["boxes"] if box_obj["label"] == "table")

            page = fh[page_num]

            for box_obj in table_boxes:
                bbox = box_obj["box"]
                x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox)
                # NOTE(review): the box is multiplied by the page's
                # transformation matrix before clipping; presumably the
                # incoming coordinates are in PDF space -- confirm with the
                # producer of vlp_output.
                rect = fitz.Rect((x1, y1), (x2, y2)) * page.transformation_matrix
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                # Drop the channel axis when the pixmap has a single component.
                shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
                images.append(image)
                info.append(
                    {
                        "pageNum": page_num,
                        "bbox": bbox,
                        "uuid": box_obj["uuid"],
                        "label": box_obj["label"],
                    }
                )

    return images, info

View File

@ -1,4 +1,5 @@
from sys import stdout
from typing import Union
from kn_utils.logging import logger
from pyinfra.examples import start_standard_queue_consumer
@ -17,7 +18,7 @@ logger.reconfigure(sink=stdout, level=settings.logging.level)
def make_dispatched_data_analysis(config):
skip_pages_without_images = config.table_parsing.skip_pages_without_images
def inner(data: bytes, message: dict) -> list:
def inner(data: Union[dict, bytes], message: dict) -> list:
operation = message["operation"]
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
return list(analyse(data))