fix: coordinate remapping

This commit is contained in:
iriley 2024-04-25 13:26:45 +02:00
parent 0f0fe516d0
commit 102617fe2f
7 changed files with 109 additions and 34 deletions

View File

@ -1,5 +1,5 @@
# cv-analysis — Visual (CV-Based) Document Parsing
parse_pdf()
This repository implements computer-vision-based approaches for detecting and parsing visual features, such as tables or
previous redactions, in documents.

View File

@ -2,14 +2,17 @@ import json
from cv_analysis.server.pipeline import make_image_analysis_pipeline
from cv_analysis.table_inference import infer_lines
from cv_analysis.utils.annotate import annotate_pdf
# Build the analysis pipeline around the table-line inference function.
pipe = make_image_analysis_pipeline(infer_lines)

# FIXME: Implement argparsing
# Load the sample document and its accompanying JSON from the repository's test data.
# Context managers close both file handles as soon as the content has been read.
with open("test/test_data/article.pdf", "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
with open("test/test_data/article.json", "r") as vlp_file:
    vlp_output = json.load(vlp_file)

# The pipeline is a generator; materialise it so the results can be handed to the annotator.
best_result = list(pipe(data={"pdf": pdf_bytes, "vlp_output": vlp_output}))

# Write a copy of the PDF with the detected lines drawn onto it.
annotate_pdf(pdf_bytes, best_result, output_path="/tmp/deine-mutter.pdf")

View File

@ -1,5 +1,5 @@
from dataclasses import asdict
from operator import truth
from operator import itemgetter, truth
from typing import Generator, Callable
from funcy import flatten, lmap
@ -46,8 +46,15 @@ def make_image_analysis_pipeline(
pdf_bytes = data["pdf"]
vlp_output = data["vlp_output"]
images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
# rel_bboxes = map()
img_results = lmap(analysis_fn, images)
img_results = lmap(transform_table_lines_by_page_info, img_results, page_info)
def make_offsets():
...
offsets = map(itemgetter("x1", "y2"), map(itemgetter("bbox"), info))
# print("before", img_results)
img_results = lmap(transform_table_lines_by_page_info, img_results, offsets, page_info)
# print("after", img_results)
results = map(lambda i: info[i] | img_results[i], range(len(info)))
yield from results

View File

@ -124,14 +124,15 @@ FILTERS = {
def get_lines_either(table_array: Array, horizontal=True) -> Array:
    """Locate candidate line positions along one axis of a table image.

    The array is averaged along one axis, the resulting profile is smoothed with the
    filters registered in ``FILTERS`` and the indices of its strict local maxima are
    returned.

    Args:
        table_array: Image array of the table (assumed to be 2-D - TODO confirm).
        horizontal: If true, average with ``axis=1`` and use the ``"row"`` filters;
            otherwise average with ``axis=0`` and use the ``"col"`` filters.

    Returns:
        Array of indices at which the filtered profile has a local maximum.
    """
    key = "row" if horizontal else "col"
    threshold = 0.4

    sums = np.mean(table_array, axis=int(horizontal))
    # The boolean mask is 1 wherever the mean is below the threshold, so the maximum
    # lifts those entries to at least 1 and leaves all others untouched.
    sums = np.maximum(sums, (sums < threshold))

    filtered_sums = filter_array(sums, FILTERS[key][1])
    if not horizontal:
        # Only the column profile gets the second filter pass, applied on top of the first.
        filtered_sums = filter_array(filtered_sums, FILTERS[key][2])

    return argrelextrema(filtered_sums, np.greater)[0]
@ -142,6 +143,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
# cv2.GaussianBlur(img,(15,5),cv2.BORDER_DEFAULT)
h, w = map(int, img.shape)
row_vals = map(int, get_lines_either(img, horizontal=True))
col_vals = map(int, get_lines_either(img, horizontal=False))

View File

@ -7,29 +7,32 @@ import fitz
from kn_utils.logging import logger
def annotate_pdf(pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None):
    """Draw the given per-page annotations into a PDF and save the result.

    Args:
        pdf: The document, in any form accepted by ``provide_byte_stream``.
        annotations: Iterable of per-page annotation dicts. Each must carry a
            ``"pageNum"`` entry that is used directly as the page index, plus whatever
            ``annotate_page`` reads from it.
        output_path: Destination of the annotated file; falls back to
            ``/tmp/annotated.pdf`` when omitted or falsy.
    """
    pdf_bytes = provide_byte_stream(pdf)
    with fitz.open(stream=pdf_bytes) as pdf_handle:
        for page_annotations in annotations:
            # FIXME: Adapt to line drawing
            # NOTE(review): "pageNum" is used as a 0-based index here - confirm against the producer.
            index = page_annotations["pageNum"]
            annotate_page(pdf_handle[index], page_annotations)
        output_path = output_path or "/tmp/annotated.pdf"
        # Save while the document handle is still open.
        pdf_handle.save(output_path)
        logger.info(f"Annotated PDF saved to {output_path}")
def annotate_page(page: fitz.Page, prediction):
    """Draw every table line of ``prediction`` onto ``page``.

    Args:
        page: Page object to draw on; only its ``draw_line`` method is used.
        prediction: Mapping with a ``"tableLines"`` entry, an iterable of mappings with
            the keys ``x1``, ``y1``, ``x2`` and ``y2``.

    Returns:
        The same ``page`` object, after drawing.
    """
    # Drawing of labelled bounding boxes (prediction["boxes"]) is currently disabled;
    # only table lines are rendered.
    get_start = itemgetter("x1", "y1")
    get_end = itemgetter("x2", "y2")
    for line in prediction["tableLines"]:
        page.draw_line(get_start(line), get_end(line))
    return page

View File

@ -1,6 +1,11 @@
import os
import cv2
from matplotlib import pyplot as plt
# Developer-specific plotting backend, enabled only for the user "isaac".
# .get() is used so that environments without a USER variable (e.g. containers or
# Windows) do not raise KeyError when this module is imported.
if os.environ.get("USER") == "isaac":
    import matplotlib

    matplotlib.use('module://matplotlib-backend-wezterm')
def show_image_cv2(image, maxdim=700):
h, w, c = image.shape

View File

@ -6,9 +6,11 @@ from typing import Tuple
import fitz
import numpy as np
from funcy import compose
from funcy import compose, lfilter
from numpy import ndarray as Array
from kn_utils.logging import logger
@dataclass
class PageInfo:
@ -16,7 +18,12 @@ class PageInfo:
rotation_matrix: fitz.Matrix
transformation_matrix: fitz.Matrix
dpi: int
width: int | float
height: int | float
image_width: int | float
image_height: int | float
rotation: int
def transform_image_coordinates_to_pdf_coordinates(
bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int = None
@ -28,22 +35,58 @@ def transform_image_coordinates_to_pdf_coordinates(
return rect.x0, rect.y0, rect.x1, rect.y1
def transform_table_lines_by_page_info(bboxes: dict, page_info: PageInfo) -> dict:
def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable[float]:
    """Scale a box from image pixel coordinates to the page's size.

    Each coordinate is multiplied by the ratio between the page dimension and the
    image dimension along the same axis. For rotations of 90 or 270 degrees the page
    width and height are swapped before the ratios are formed.

    Args:
        bbox: ``(x1, y1, x2, y2)`` in image pixels; only the first four values are used.
        page_info: Provides ``width``, ``height``, ``image_width``, ``image_height``
            and ``rotation``.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` of the scaled coordinates, each rounded to three
        decimals.
    """
    pdf_w, pdf_h = page_info.width, page_info.height
    if page_info.rotation in {90, 270}:
        pdf_w, pdf_h = pdf_h, pdf_w
    pix_w, pix_h = page_info.image_width, page_info.image_height

    x1, y1, x2, y2 = tuple(bbox)[:4]
    # Multiply before dividing and round only the final values: rounding the scale
    # ratio first would introduce an error that grows with the coordinate.
    scaled = (x1 * pdf_w / pix_w, y1 * pdf_h / pix_h, x2 * pdf_w / pix_w, y2 * pdf_h / pix_h)
    return tuple(round(value, 3) for value in scaled)
def transform_table_lines_by_page_info(bboxes: dict, offsets: tuple, page_info: PageInfo) -> dict:
    """Rescale the table lines of one analysis result and shift them by a fixed offset.

    Every entry of ``bboxes["tableLines"]`` (a dict with the keys ``x1``, ``y1``,
    ``x2``, ``y2``) is first passed through ``rescale_to_pdf`` and then translated.

    Args:
        bboxes: Analysis result. A missing ``"tableLines"`` key is treated as an empty
            list. The dict is modified in place.
        offsets: ``(offset_x, offset_y)`` pair. ``offset_x`` is added as is;
            ``offset_y`` is subtracted from ``page_info.height`` before being added.
        page_info: Page geometry, forwarded to ``rescale_to_pdf``.

    Returns:
        The same ``bboxes`` dict with ``"tableLines"`` replaced by the converted lines.
    """
    # FIXME: Also convert image info? Is image info necessary?
    # Also, the resulting lines are not in the table bbox, is this okay?
    import json

    # The offsets are identical for every line, so resolve them once up front.
    offset_x, offset_y = offsets
    logger.debug(f"{offsets=}")
    # NOTE(review): rescale_to_pdf swaps width/height for 90/270 degree rotations, while
    # the unswapped height is used here - confirm this is intended for rotated pages.
    offset_y = page_info.height - offset_y
    logger.debug((f"new offsets: {offset_x}, {offset_y}"))

    unpack = itemgetter("x1", "y1", "x2", "y2")

    def convert(line: dict) -> dict:
        # Scale first, then translate.
        x1, y1, x2, y2 = rescale_to_pdf(unpack(line), page_info=page_info)
        return {"x1": x1 + offset_x, "y1": y1 + offset_y, "x2": x2 + offset_x, "y2": y2 + offset_y}

    table_lines = bboxes.get("tableLines", [])
    transformed_lines = [convert(line) for line in table_lines]
    bboxes["tableLines"] = transformed_lines

    # Before/after dump for debugging; kept at debug level so it does not flood the logs.
    for original, transformed in zip(table_lines, transformed_lines):
        logger.debug(json.dumps(original, indent=4))
        logger.debug(json.dumps(transformed, indent=4))
        logger.debug('')

    return bboxes
@ -64,8 +107,20 @@ def extract_images_from_pdf(
boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
page = fh[page_num] # pages[int(page_num)]
page.wrap_contents()
# TODO: Workaround to be able to transform the image coordinates to pdf coordinates in a later step.
current_page_info = PageInfo(page_num, page.rotation_matrix, page.transformation_matrix, dpi)
page_image = page.get_pixmap(dpi=200)
# import IPython; IPython.embed()
current_page_info = PageInfo(
page_num,
page.rotation_matrix,
page.transformation_matrix,
dpi,
*page.rect[-2:],
page_image.w,
page_image.h,
page.rotation,
)
for box_obj in boxes:
bbox = box_obj["box"]