73 lines
2.0 KiB
Python
73 lines
2.0 KiB
Python
from functools import singledispatch
|
|
from operator import itemgetter
|
|
from pathlib import Path
|
|
from typing import Union
|
|
|
|
import fitz # type: ignore
|
|
from kn_utils.logging import logger
|
|
|
|
from cv_analysis.utils.image_extraction import mirror_horizontal # type: ignore
|
|
|
|
|
|
def annotate_pdf(
    pdf: Union[str, bytes, Path],
    annotations,
    output_path: Union[str, Path, None] = None,
):
    """Draw per-page annotations onto a PDF and write the result to disk.

    Args:
        pdf: The source document, either as raw bytes or as a path
            (``str`` or ``Path``); it is converted to bytes via
            ``provide_byte_stream``.
        annotations: Iterable of per-page mappings. Each mapping must have a
            ``"pageNum"`` key, which is used directly as the index into the
            opened document, and is forwarded unchanged to ``annotate_page``.
        output_path: Where to save the annotated document. Any falsy value
            (``None``, empty string) selects ``/tmp/annotated.pdf``.
    """
    # Resolve the destination first; it does not depend on the document.
    if not output_path:
        output_path = "/tmp/annotated.pdf"

    with fitz.open(stream=provide_byte_stream(pdf)) as document:
        for entry in annotations:
            annotate_page(document[entry["pageNum"]], entry)
        # Must happen while the document handle is still open.
        document.save(output_path)

    logger.info(f"Annotated PDF saved to {output_path}")
|
|
|
|
|
|
def annotate_page(page: fitz.Page, prediction):
    """Draw the predicted boxes and table lines of one page onto ``page``.

    Args:
        page: The PyMuPDF page to draw on; it is modified in place.
        prediction: Mapping with optional ``"boxes"`` and ``"tableLines"``
            entries. Each box must provide ``"box"`` (a mapping with
            ``x1``/``y1``/``x2``/``y2``), ``"label"``, ``"probability"`` and
            ``"uuid"``. Each table line must provide ``x1``/``y1``/``x2``/``y2``.

    Returns:
        The same ``page`` object that was passed in.
    """
    for box in prediction.get("boxes", []):
        x0, y0, x1, y1 = itemgetter("x1", "y1", "x2", "y2")(box["box"])
        label, probability, uuid = itemgetter("label", "probability", "uuid")(box)

        page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2)
        # The caption is anchored 5 units above the box's (x1, y1) corner.
        page.insert_text(
            (x0, y0 - 5),
            f"{label} ({probability:.2f}), {uuid}",
            fontsize=12,
            color=(0.4, 0.4, 1),
        )

    for line in prediction.get("tableLines", []):
        bbox = itemgetter("x1", "y1", "x2", "y2")(line)
        # The fourth component of the page's bounding rectangle is passed as
        # the page height.
        height = page.bound()[3]
        # NOTE(review): only table lines go through mirror_horizontal; boxes
        # are drawn with their coordinates as given. Presumably the two use
        # different coordinate origins -- confirm against the producer.
        bbox = mirror_horizontal(bbox, page_height=height)

        page.draw_line(tuple(bbox[:2]), tuple(bbox[2:]), color=(1, 0, 0.5), width=1)
    return page
|
|
|
|
|
|
@singledispatch
def provide_byte_stream(pdf: Union[bytes, Path, str]) -> bytes:
    """Return the raw bytes of a PDF given as bytes or as a file path.

    Dispatches on the type of ``pdf``: ``bytes`` are returned unchanged,
    while ``str`` and ``Path`` are treated as a file path whose contents are
    read in binary mode.

    Raises:
        TypeError: If ``pdf`` is of a type with no registered implementation.
            Failing here keeps an unusable ``None`` from being handed on to
            the PDF library.
        OSError: If a path is given and the file cannot be read.
    """
    raise TypeError(
        f"Unsupported PDF input type {type(pdf).__name__!r}; "
        "expected bytes, str or Path."
    )


@provide_byte_stream.register(bytes)
def _from_bytes(pdf):
    """Pass already-loaded PDF bytes through unchanged."""
    return pdf


@provide_byte_stream.register(str)
@provide_byte_stream.register(Path)
def _from_path(pdf):
    """Read the file at ``pdf`` and return its contents as bytes."""
    return Path(pdf).read_bytes()
|