fix: coordinate remapping
This commit is contained in:
parent
0f0fe516d0
commit
102617fe2f
@ -1,5 +1,5 @@
|
||||
# cv-analysis — Visual (CV-Based) Document Parsing
|
||||
|
||||
parse_pdf()
|
||||
This repository implements computer-vision-based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
|
||||
@ -2,14 +2,17 @@ import json
|
||||
|
||||
from cv_analysis.server.pipeline import make_image_analysis_pipeline
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
from cv_analysis.utils.annotate import annotate_pdf
|
||||
|
||||
pipe = make_image_analysis_pipeline(infer_lines)

# FIXME: Implement argparsing

# Read both inputs through context managers so the file handles are closed
# deterministically instead of being left open until garbage collection.
with open("test/test_data/article.pdf", "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
with open("test/test_data/article.json", "r") as vlp_file:
    vlp_output = json.load(vlp_file)

# Materialize the pipeline output as a list before handing it to the annotator.
best_result = list(pipe(data={"pdf": pdf_bytes, "vlp_output": vlp_output}))

# print(best_result)

annotate_pdf(pdf_bytes, best_result, output_path="/tmp/deine-mutter.pdf")
|
||||
@ -1,5 +1,5 @@
|
||||
from dataclasses import asdict
|
||||
from operator import truth
|
||||
from operator import itemgetter, truth
|
||||
from typing import Generator, Callable
|
||||
|
||||
from funcy import flatten, lmap
|
||||
@ -46,8 +46,15 @@ def make_image_analysis_pipeline(
|
||||
pdf_bytes = data["pdf"]
|
||||
vlp_output = data["vlp_output"]
|
||||
images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
|
||||
# rel_bboxes = map()
|
||||
img_results = lmap(analysis_fn, images)
|
||||
img_results = lmap(transform_table_lines_by_page_info, img_results, page_info)
|
||||
def make_offsets():
|
||||
...
|
||||
|
||||
offsets = map(itemgetter("x1", "y2"), map(itemgetter("bbox"), info))
|
||||
# print("before", img_results)
|
||||
img_results = lmap(transform_table_lines_by_page_info, img_results, offsets, page_info)
|
||||
# print("after", img_results)
|
||||
results = map(lambda i: info[i] | img_results[i], range(len(info)))
|
||||
|
||||
yield from results
|
||||
|
||||
@ -124,14 +124,15 @@ FILTERS = {
|
||||
|
||||
def get_lines_either(table_array: Array, horizontal=True) -> Array:
    """Return indices of local maxima in the filtered axis means of ``table_array``.

    Args:
        table_array: Array to analyse. It is averaged along axis 1 when
            ``horizontal`` is true and along axis 0 otherwise, so it needs at
            least two axes in the horizontal case.
        horizontal: Selects the ``"row"`` filters (True) or the ``"col"``
            filters (False) from the module-level ``FILTERS`` mapping.

    Returns:
        The index array produced by ``argrelextrema`` for strict local maxima
        of the filtered means.
    """
    key = "row" if horizontal else "col"
    THRESHOLD = 0.4

    sums = np.mean(table_array, axis=int(horizontal))
    # ``sums < THRESHOLD`` is a boolean mask, so the element-wise maximum lifts
    # every mean below the threshold to 1 and leaves the other means untouched.
    sums = np.maximum(sums, sums < THRESHOLD)

    filtered_sums = filter_array(sums, FILTERS[key][1])
    if not horizontal:
        # Only the column direction gets the second filter pass.
        filtered_sums = filter_array(filtered_sums, FILTERS[key][2])

    return argrelextrema(filtered_sums, np.greater)[0]
|
||||
|
||||
@ -142,6 +143,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
|
||||
|
||||
|
||||
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
|
||||
# cv2.GaussianBlur(img,(15,5),cv2.BORDER_DEFAULT)
|
||||
h, w = map(int, img.shape)
|
||||
row_vals = map(int, get_lines_either(img, horizontal=True))
|
||||
col_vals = map(int, get_lines_either(img, horizontal=False))
|
||||
|
||||
@ -7,29 +7,32 @@ import fitz
|
||||
from kn_utils.logging import logger
|
||||
|
||||
|
||||
def annotate_pdf(pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path, None] = None):
    """Draw the given per-page annotations into a PDF and save the result.

    Args:
        pdf: The document, in any form accepted by ``provide_byte_stream``.
        annotations: Iterable of per-page mappings. Each must contain a
            ``"pageNum"`` entry, which is used directly as the index into the
            opened document, and is passed on to ``annotate_page``.
        output_path: Destination of the annotated file. Falls back to
            ``/tmp/annotated.pdf`` when omitted or otherwise falsy.
    """
    pdf_bytes = provide_byte_stream(pdf)
    with fitz.open(stream=pdf_bytes) as pdf_handle:
        for page_annotations in annotations:
            # FIXME: Adapt to line drawing
            # NOTE(review): "pageNum" is used as-is as the page index; confirm
            # that the producer emits indices in the base the document expects.
            index = page_annotations["pageNum"]
            annotate_page(pdf_handle[index], page_annotations)
        output_path = output_path or "/tmp/annotated.pdf"
        pdf_handle.save(output_path)
        logger.info(f"Annotated PDF saved to {output_path}")
|
||||
|
||||
|
||||
def annotate_page(page: "fitz.Page", prediction):
    """Draw every table line of ``prediction`` onto ``page`` and return the page.

    Args:
        page: Object offering ``draw_line(start, end)``; each point is passed
            as an ``(x, y)`` tuple.
        prediction: Mapping whose optional ``"tableLines"`` entry is an
            iterable of mappings with the keys ``x1``, ``y1``, ``x2``, ``y2``.
            A missing entry is treated as "no lines", matching how
            ``transform_table_lines_by_page_info`` reads the same key.

    Returns:
        The same ``page`` object that was passed in.
    """
    # NOTE: Drawing of the labelled rectangles from prediction["boxes"] is
    # currently disabled; only table lines are rendered.
    start_of = itemgetter("x1", "y1")
    end_of = itemgetter("x2", "y2")
    for line in prediction.get("tableLines", []):
        page.draw_line(start_of(line), end_of(line))
    return page
|
||||
|
||||
|
||||
|
||||
@ -1,6 +1,11 @@
|
||||
import os
|
||||
import cv2
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# Per-developer plotting backend, enabled only for the user named "isaac".
# ``.get`` is used so that importing this module does not raise ``KeyError``
# in environments where the USER variable is not set.
if os.environ.get("USER") == "isaac":
    import matplotlib

    matplotlib.use('module://matplotlib-backend-wezterm')
|
||||
|
||||
|
||||
def show_image_cv2(image, maxdim=700):
|
||||
h, w, c = image.shape
|
||||
|
||||
@ -6,9 +6,11 @@ from typing import Tuple
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
from funcy import compose
|
||||
from funcy import compose, lfilter
|
||||
from numpy import ndarray as Array
|
||||
|
||||
from kn_utils.logging import logger
|
||||
|
||||
|
||||
@dataclass
|
||||
class PageInfo:
|
||||
@ -16,6 +18,11 @@ class PageInfo:
|
||||
rotation_matrix: fitz.Matrix
|
||||
transformation_matrix: fitz.Matrix
|
||||
dpi: int
|
||||
width: int | float
|
||||
height: int | float
|
||||
image_width: int | float
|
||||
image_height: int | float
|
||||
rotation: int
|
||||
|
||||
|
||||
def transform_image_coordinates_to_pdf_coordinates(
|
||||
@ -28,22 +35,58 @@ def transform_image_coordinates_to_pdf_coordinates(
|
||||
return rect.x0, rect.y0, rect.x1, rect.y1
|
||||
|
||||
|
||||
def transform_table_lines_by_page_info(bboxes: dict, page_info: PageInfo) -> dict:
|
||||
def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable[float]:
|
||||
pdf_h, pdf_w = page_info.height, page_info.width
|
||||
if page_info.rotation in {90, 270}:
|
||||
pdf_h, pdf_w = pdf_w, pdf_h
|
||||
pix_h, pix_w = page_info.image_height, page_info.image_width
|
||||
ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w
|
||||
round3 = lambda x: tuple(map(lambda y: round(y, 3), x))
|
||||
ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3((ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h))
|
||||
new_bbox = round3((bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h))
|
||||
# logger.info(f"{pdf_h=}, {pix_h=}, {pdf_w=}, {pix_w=}, {ratio_w=}, {ratio_h=}")
|
||||
# logger.info(round3(bbox))
|
||||
# logger.info(new_bbox)
|
||||
return new_bbox
|
||||
|
||||
|
||||
def transform_table_lines_by_page_info(bboxes: dict, offsets: tuple, page_info: PageInfo) -> dict:
    """Rescale and shift the table lines stored in ``bboxes``.

    Every entry of ``bboxes["tableLines"]`` is first scaled with
    ``rescale_to_pdf`` and then translated by ``offsets``. The result replaces
    ``bboxes["tableLines"]`` (the key is created as an empty list if absent).

    Args:
        bboxes: Mapping with an optional ``"tableLines"`` list of mappings
            keyed ``x1``, ``y1``, ``x2``, ``y2``. It is modified in place.
        offsets: Pair ``(offset_x, offset_y)``. ``offset_y`` is subtracted
            from ``page_info.height`` before it is applied.
        page_info: Page description forwarded to ``rescale_to_pdf``; its
            ``height`` is also used for the vertical offset.

    Returns:
        The same ``bboxes`` mapping that was passed in.
    """
    # FIXME: Also convert image info? Is image info necessary?
    # Also, the resulting lines are not in the table bbox, is this okay?
    import json  # only needed for the diagnostic dump below

    logger.info(f"{offsets=}")

    # The offsets do not depend on the individual line, so resolve and report
    # them once instead of once per line.
    offset_x, offset_y = offsets
    # NOTE(review): presumably converts a bottom-origin y value into a
    # top-origin one; confirm against the producer of ``offsets``.
    offset_y = page_info.height - offset_y
    logger.info((f"new offsets: {offset_x}, {offset_y}"))

    corners = itemgetter("x1", "y1", "x2", "y2")

    def convert(line: dict) -> dict:
        # Scale first, then translate; the offsets are applied unscaled.
        x1, y1, x2, y2 = rescale_to_pdf(corners(line), page_info=page_info)
        return {
            "x1": x1 + offset_x,
            "y1": y1 + offset_y,
            "x2": x2 + offset_x,
            "y2": y2 + offset_y,
        }

    table_lines = bboxes.get("tableLines", [])
    transformed_lines = [convert(line) for line in table_lines]
    bboxes["tableLines"] = transformed_lines

    # Diagnostic dump: each original line followed by its transformed version.
    for original, transformed in zip(table_lines, transformed_lines):
        logger.info(json.dumps(original, indent=4))
        logger.info(json.dumps(transformed, indent=4))
        logger.info('')

    return bboxes
|
||||
|
||||
@ -64,8 +107,20 @@ def extract_images_from_pdf(
|
||||
boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
|
||||
|
||||
page = fh[page_num] # pages[int(page_num)]
|
||||
page.wrap_contents()
|
||||
# TODO: Workaround to be able to transform the image coordinates to pdf coordinates in a later step.
|
||||
current_page_info = PageInfo(page_num, page.rotation_matrix, page.transformation_matrix, dpi)
|
||||
page_image = page.get_pixmap(dpi=200)
|
||||
# import IPython; IPython.embed()
|
||||
current_page_info = PageInfo(
|
||||
page_num,
|
||||
page.rotation_matrix,
|
||||
page.transformation_matrix,
|
||||
dpi,
|
||||
*page.rect[-2:],
|
||||
page_image.w,
|
||||
page_image.h,
|
||||
page.rotation,
|
||||
)
|
||||
|
||||
for box_obj in boxes:
|
||||
bbox = box_obj["box"]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user