fix: import error and fitz api correction in extract_images_from_pdf; table inference test

This commit is contained in:
iriley 2024-04-22 11:43:37 +02:00
parent e264c948cf
commit abd350cc42
5 changed files with 25 additions and 10 deletions

View File

@ -9,7 +9,8 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
from pdf2img.default_objects.rectangle import RectanglePlus
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_inference import extract_images_from_pdf, infer_lines
from cv_analysis.table_inference import infer_lines
from cv_analysis.utils.image_extraction import extract_images_from_pdf
from cv_analysis.table_parsing import parse_lines, parse_tables
from cv_analysis.utils.structures import Rectangle
@ -48,8 +49,8 @@ def make_image_analysis_pipeline(
) -> Generator[dict, bytes, None]:
def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
img_results = map(analysis_fn, images)
results = map(lambda i: info[i] | {"tableLines": img_results[i]}, range(len(info)))
img_results = list(map(analysis_fn, images))
results = map(lambda i: info[i] | img_results[i], range(len(info)))
yield from results

View File

@ -13,8 +13,6 @@ import fitz
from pdf2img.conversion import convert_pages_to_images
def show_multiple(arrs: Tuple[Array], title: str = ""):
plt.clf()
plt.cla()
@ -113,7 +111,7 @@ def filter_array(
padding: Optional[Array] = None,
pad_value_function: Callable[[Array], float] = np.mean,
) -> Array:
if not sum_filter:
if sum_filter is None:
return array
fsize = len(sum_filter)
assert fsize % 2

View File

@ -8,6 +8,7 @@ from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
from PIL import Image
def transform_image_coordinates_to_pdf_coordinates(
@ -33,14 +34,17 @@ def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200)
boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
page = fh[page_num] #pages[int(page_num)]
h, w = page.shape
page_pixmap = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
h, w = page_pixmap.h, page_pixmap.w
for bbox in boxes:
x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"]))
y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"]))
rect = fitz.Rect((x1, y1), (x2, y2))
pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
images.append(image)
info.append({"pageNum": page_num, "bbox": bbox})

View File

@ -78,7 +78,7 @@ def formatter(operation):
raise
@pytest.mark.parametrize("operation", ["table_cells", "figure"])
@pytest.mark.parametrize("operation", ["figure"])
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
analysis_pipeline = make_analysis_pipeline(
analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False

View File

@ -0,0 +1,12 @@
from cv_analysis.server.pipeline import make_image_analysis_pipeline
from cv_analysis.table_inference import infer_lines
def test_table_inference():
pl = make_image_analysis_pipeline(infer_lines)
with open("test/test_data/article.pdf", "rb") as f:
pdf_bytes = f.read()
vlp_mock = {"data": [{"page_idx": 1, "image_boxes": [{"label": "table", "x1": 0.1, "y1": 0.3, "x2": 0.4, "y2": 0.6}]}]}
output = list(pl(pdf_bytes, vlp_mock))
lines = output[0]["tableLines"]
assert len(lines) > 1
assert all(map(lambda item: sorted(item.keys())==['x1', 'x2', 'y1', 'y2'], lines))