diff --git a/README.md b/README.md index 9584d3f..873b104 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # cv-analysis — Visual (CV-Based) Document Parsing - +parse_pdf() This repository implements computer vision based approaches for detecting and parsing visual features such as tables or previous redactions in documents. diff --git a/scripts/parse_pdf.py b/scripts/parse_pdf.py index e6052ca..d90a5f4 100644 --- a/scripts/parse_pdf.py +++ b/scripts/parse_pdf.py @@ -2,14 +2,17 @@ import json from cv_analysis.server.pipeline import make_image_analysis_pipeline from cv_analysis.table_inference import infer_lines +from cv_analysis.utils.annotate import annotate_pdf pipe = make_image_analysis_pipeline(infer_lines) # FIXME: Implement argparsing -pdf_bytes = open("/home/junverfehrt/Documents/rosario_test_file.pdf", "rb").read() -vlp_output = json.load(open("/home/junverfehrt/Documents/rosario_test_file_vlp.json", "r")) +pdf_bytes = open("test/test_data/article.pdf", "rb").read() +vlp_output = json.load(open("test/test_data/article.json", "r")) best_result = list(pipe(data={"pdf": pdf_bytes, "vlp_output": vlp_output})) -print(best_result) +# print(best_result) + +annotate_pdf(pdf_bytes, best_result, output_path = "/tmp/deine-mutter.pdf") \ No newline at end of file diff --git a/src/cv_analysis/server/pipeline.py b/src/cv_analysis/server/pipeline.py index 08a29f9..539e084 100644 --- a/src/cv_analysis/server/pipeline.py +++ b/src/cv_analysis/server/pipeline.py @@ -1,5 +1,5 @@ from dataclasses import asdict -from operator import truth +from operator import itemgetter, truth from typing import Generator, Callable from funcy import flatten, lmap @@ -46,8 +46,15 @@ def make_image_analysis_pipeline( pdf_bytes = data["pdf"] vlp_output = data["vlp_output"] images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output) + # rel_bboxes = map() img_results = lmap(analysis_fn, images) - img_results = lmap(transform_table_lines_by_page_info, img_results, page_info) + def make_offsets(): + ... + + offsets = map(itemgetter("x1", "y2"), map(itemgetter("bbox"), info)) + # print("before", img_results) + img_results = lmap(transform_table_lines_by_page_info, img_results, offsets, page_info) + # print("after", img_results) results = map(lambda i: info[i] | img_results[i], range(len(info))) yield from results diff --git a/src/cv_analysis/table_inference.py b/src/cv_analysis/table_inference.py index 3a1082a..601a7aa 100644 --- a/src/cv_analysis/table_inference.py +++ b/src/cv_analysis/table_inference.py @@ -124,14 +124,15 @@ FILTERS = { def get_lines_either(table_array: Array, horizontal=True) -> Array: key = "row" if horizontal else "col" - THRESHOLD = 0.3 + THRESHOLD = 0.4 filters = FILTERS sums = np.mean(table_array, axis=int(horizontal)) sums = np.maximum(sums, (sums < THRESHOLD)) # save_plot(rows, name=save_path / "rows", title="raw row averages") filtered_sums = filter_array(sums, FILTERS[key][1]) # ROW_FILTER1) - filtered_sums = filter_array(sums, FILTERS[key][2]) # ROW_FILTER2) + if not horizontal: + filtered_sums = filter_array(filtered_sums, FILTERS[key][2]) # ROW_FILTER2) lines = argrelextrema(filtered_sums, np.greater)[0] return lines @@ -142,6 +143,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array: def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]: + # cv2.GaussianBlur(img,(15,5),cv2.BORDER_DEFAULT) h, w = map(int, img.shape) row_vals = map(int, get_lines_either(img, horizontal=True)) col_vals = map(int, get_lines_either(img, horizontal=False)) diff --git a/src/cv_analysis/utils/annotate.py b/src/cv_analysis/utils/annotate.py index 5822186..f02f865 100644 --- a/src/cv_analysis/utils/annotate.py +++ b/src/cv_analysis/utils/annotate.py @@ -7,29 +7,32 @@ import fitz from kn_utils.logging import logger -def annotate_pdf(pdf: Union[str, bytes, Path], predictions, output_path: Union[str, Path] = None): +def annotate_pdf(pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None): pdf_bytes = provide_byte_stream(pdf) with fitz.open(stream=pdf_bytes) as pdf_handle: - for prediction in predictions: + for page_annotations in annotations: # FIXME: Adapt to line drawing - index = prediction["page_idx"] - annotate_page(pdf_handle[index], prediction) + index = page_annotations["pageNum"] + annotate_page(pdf_handle[index], page_annotations) output_path = output_path or "/tmp/annotated.pdf" pdf_handle.save(output_path) logger.info(f"Annotated PDF saved to {output_path}") def annotate_page(page: fitz.Page, prediction): - for box in prediction["boxes"]: - bbox = itemgetter("x1", "y1", "x2", "y2")(box["box"]) - label, probability, uuid = itemgetter("label", "probability", "uuid")(box) - - bbox = mirror_on_x_axis(bbox, page.bound().height) - x0, y0, x1, y1 = bbox - page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2) - label_x, label_y = x0, y0 - 5 - page.insert_text((label_x, label_y), f"{label} ({probability:.2f}), {uuid}", fontsize=12, color=(0.4, 0.4, 1)) + # for box in prediction["boxes"]: + # bbox = itemgetter("x1", "y1", "x2", "y2")(box["box"]) + # label, probability, uuid = itemgetter("label", "probability", "uuid")(box) + # bbox = mirror_on_x_axis(bbox, page.bound().height) + # x0, y0, x1, y1 = bbox + # page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2) + # label_x, label_y = x0, y0 - 5 + # page.insert_text((label_x, label_y), f"{label} ({probability:.2f}), {uuid}", fontsize=12, color=(0.4, 0.4, 1)) + for line in prediction["tableLines"]: + start = itemgetter("x1", "y1")(line) + end = itemgetter("x2", "y2")(line) + page.draw_line(start, end) return page diff --git a/src/cv_analysis/utils/display.py b/src/cv_analysis/utils/display.py index 0d3f2a6..9c8ca6e 100644 --- a/src/cv_analysis/utils/display.py +++ b/src/cv_analysis/utils/display.py @@ -1,6 +1,11 @@ +import os import cv2 from matplotlib import pyplot as plt +if os.environ["USER"] == "isaac": + import matplotlib + matplotlib.use('module://matplotlib-backend-wezterm') + def show_image_cv2(image, maxdim=700): h, w, c = image.shape diff --git a/src/cv_analysis/utils/image_extraction.py b/src/cv_analysis/utils/image_extraction.py index 4ff32e6..4121eff 100644 --- a/src/cv_analysis/utils/image_extraction.py +++ b/src/cv_analysis/utils/image_extraction.py @@ -6,9 +6,11 @@ from typing import Tuple import fitz import numpy as np -from funcy import compose +from funcy import compose, lfilter from numpy import ndarray as Array +from kn_utils.logging import logger + @dataclass class PageInfo: @@ -16,7 +18,12 @@ class PageInfo: rotation_matrix: fitz.Matrix transformation_matrix: fitz.Matrix dpi: int - + width: int | float + height: int | float + image_width: int | float + image_height: int | float + rotation: int + def transform_image_coordinates_to_pdf_coordinates( bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int = None @@ -28,22 +35,58 @@ def transform_image_coordinates_to_pdf_coordinates( return rect.x0, rect.y0, rect.x1, rect.y1 -def transform_table_lines_by_page_info(bboxes: dict, page_info: PageInfo) -> dict: +def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable[float]: + pdf_h, pdf_w = page_info.height, page_info.width + if page_info.rotation in {90, 270}: + pdf_h, pdf_w = pdf_w, pdf_h + pix_h, pix_w = page_info.image_height, page_info.image_width + ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w + round3 = lambda x: tuple(map(lambda y: round(y, 3), x)) + ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3((ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h)) + new_bbox = round3((bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h)) + # logger.info(f"{pdf_h=}, {pix_h=}, {pdf_w=}, {pix_w=}, {ratio_w=}, {ratio_h=}") + # logger.info(round3(bbox)) + # logger.info(new_bbox) + return new_bbox + + +def transform_table_lines_by_page_info(bboxes: dict, offsets: tuple, page_info: PageInfo) -> dict: # FIXME: Also convert image info? Is image info necessary? # Also, the resulting lines are not in the table bbox, is this okay? - transform = partial( - transform_image_coordinates_to_pdf_coordinates, - rotation_matrix=page_info.rotation_matrix, - transformation_matrix=page_info.transformation_matrix, - dpi=page_info.dpi, - ) + + # transform = partial( + # transform_image_coordinates_to_pdf_coordinates, + # rotation_matrix=page_info.rotation_matrix, + # transformation_matrix=page_info.transformation_matrix, + # dpi=page_info.dpi, + # ) + + transform = partial(rescale_to_pdf, page_info=page_info) + logger.info(f"{offsets=}") + + def apply_offsets(line: tuple) -> tuple: + x1, y1, x2, y2 = line + offset_x, offset_y = offsets + offset_y = page_info.height - offset_y # - (y2 * (y1 != y2)) + logger.info((f"new offsets: {offset_x}, {offset_y}")) + + return (x1 + offset_x, y1 + offset_y, x2 + offset_x, y2 + offset_y) + unpack = itemgetter("x1", "y1", "x2", "y2") pack = lambda x: {"x1": x[0], "y1": x[1], "x2": x[2], "y2": x[3]} - convert = compose(pack, transform, unpack) - + # convert = compose(pack, transform, apply_offsets, unpack) + convert = compose(pack, apply_offsets, transform, unpack) + # convert = compose(pack, transform, unpack) + table_lines = bboxes.get("tableLines", []) transformed_lines = list(map(convert, table_lines)) - bboxes["tableLines"] = transformed_lines + bboxes["tableLines"] = transformed_lines #lfilter(lambda b: b['y1']==b['y2'], transformed_lines) + import json + for i in range(len(table_lines)): + logger.info(json.dumps(table_lines[i], indent=4)) + logger.info(json.dumps(transformed_lines[i], indent=4)) + logger.info('') + # exit() return bboxes @@ -64,8 +107,20 @@ def extract_images_from_pdf( boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes) page = fh[page_num] # pages[int(page_num)] + page.wrap_contents() # TODO: Workaround to be able to transform the image coordinates to pdf coordinates in a later step. - current_page_info = PageInfo(page_num, page.rotation_matrix, page.transformation_matrix, dpi) + page_image = page.get_pixmap(dpi=200) + # import IPython; IPython.embed() + current_page_info = PageInfo( + page_num, + page.rotation_matrix, + page.transformation_matrix, + dpi, + *page.rect[-2:], + page_image.w, + page_image.h, + page.rotation, + ) for box_obj in boxes: bbox = box_obj["box"]