feat: remove argrelextrema because it is not working; use pure NumPy instead

This commit is contained in:
iriley 2024-04-26 14:21:00 +02:00
parent 1d3b077ace
commit f7a0db2651
3 changed files with 145 additions and 84 deletions

View File

@ -1,12 +1,12 @@
from operator import itemgetter
from pathlib import Path from pathlib import Path
from typing import Callable, Optional from typing import Callable, Optional, Tuple
from typing import Tuple
import cv2 import cv2
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
from kn_utils.logging import logger
from numpy import ndarray as Array from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm from scipy.stats import norm
@ -38,14 +38,20 @@ def save_plot(arr: Array, name: str, title: str = "") -> None:
plt.savefig(Path(str(name) + ".png")) plt.savefig(Path(str(name) + ".png"))
def save_lines(
    img: Array,
    lines: list[dict[str, int]],
    output_path: str = "/tmp/lines.png",
) -> None:
    """Draw line segments onto an image and write the result to disk.

    Debugging aid for inspecting inferred table lines.

    Args:
        img: Single-channel image; it is converted with
            ``cv2.COLOR_GRAY2RGB`` so the overlay can be drawn in colour.
        lines: Segments to draw, each a dict with the integer keys
            ``x1``, ``y1``, ``x2`` and ``y2``.
        output_path: Destination file. Defaults to the previously
            hard-coded ``/tmp/lines.png``.
    """
    canvas = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    endpoints = itemgetter("x1", "y1", "x2", "y2")
    for line in lines:
        x1, y1, x2, y2 = endpoints(line)
        canvas = cv2.line(
            canvas, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=3
        )
    cv2.imwrite(output_path, canvas)
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Return a normalised 1-D Gaussian kernel with an odd number of taps.

    Args:
        kernel_size: Requested number of taps. Even values are increased by
            one so that the kernel has a well-defined centre tap.
        sd: Standard deviation of the Gaussian, in taps. Must be positive.

    Returns:
        A float array that sums to 1.

    Raises:
        ValueError: If ``sd`` is not positive (``norm.pdf`` would otherwise
            yield a kernel consisting only of NaN).
    """
    if sd <= 0:
        raise ValueError(f"sd must be positive, got {sd}")
    kernel_size += int(not kernel_size % 2)  # force an odd tap count
    wing_size = (kernel_size - 1) // 2
    xvals = np.arange(-wing_size, wing_size + 1)
    kernel = norm.pdf(xvals, scale=sd)
    kernel /= np.sum(kernel)
    return kernel
@ -56,31 +62,21 @@ def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
wing_size = int((kernel_size - 1) / 2) wing_size = int((kernel_size - 1) / 2)
xvals = np.arange(-wing_size, wing_size + 1) xvals = np.arange(-wing_size, wing_size + 1)
kernel = norm.pdf(xvals, scale=sd) kernel = norm.pdf(xvals, scale=sd)
# maxval, minval = np.max(kernel), np.min(kernel)
# diff = maxval - minval
# kernel += (diff / (1 - ratio))
kernel /= np.sum(kernel) kernel /= np.sum(kernel)
return kernel return kernel
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Return a normalised 1-D kernel shaped like an inverted parabola.

    The kernel starts as ``-x**2`` over the symmetric tap range and is lifted
    by ``(max - min) / (1 - ratio)``. With that offset the outermost taps end
    up weighing ``ratio`` times the centre tap.

    Args:
        kernel_size: Requested number of taps. Even values are increased by
            one so that the kernel has a centre tap.
        ratio: Edge-tap weight divided by centre-tap weight.

    Returns:
        A float array that sums to 1. When the parabola is flat (a single
        tap) or ``ratio == 1`` the result is a uniform kernel; the plain
        formula would divide by zero and produce NaN in those cases.
    """
    kernel_size += int(not kernel_size % 2)  # force an odd tap count
    wing_size = (kernel_size - 1) // 2
    xvals = np.arange(-wing_size, wing_size + 1, dtype=float)
    kernel = -(xvals**2)
    diff = np.max(kernel) - np.min(kernel)
    if diff == 0 or ratio == 1:
        # Degenerate inputs: every tap carries the same weight.
        return np.full(kernel_size, 1.0 / kernel_size)
    kernel += diff / (1 - ratio)
    kernel /= np.sum(kernel)
    return kernel
@ -93,7 +89,8 @@ def min_avg_for_interval(filtered: Array, interval: int) -> float:
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Pick the interval in ``[min_interval, max_interval]`` with the lowest score.

    ``min_avg_for_interval`` is evaluated for every candidate interval; its
    result is unpacked into ``(score, payload)`` and the candidate with the
    smallest score wins.

    Args:
        filtered: Array forwarded unchanged to ``min_avg_for_interval``.
        min_interval: Smallest interval to try (inclusive).
        max_interval: Largest interval to try (inclusive).

    Returns:
        ``(best_interval, payload)``, where ``payload`` is the second value
        returned by ``min_avg_for_interval`` for the winning interval.

    Raises:
        ValueError: If the interval range is empty.
    """
    if max_interval < min_interval:
        raise ValueError(
            f"empty interval range: min_interval={min_interval}, "
            f"max_interval={max_interval}"
        )
    performance = (
        (interval, *min_avg_for_interval(filtered, interval))
        for interval in range(min_interval, max_interval + 1)
    )
    best = min(performance, key=lambda x: x[1])
    return best[0], best[2]
@ -103,7 +100,7 @@ def filter_array(
array: Array, array: Array,
sum_filter: Array, sum_filter: Array,
padding: Optional[Array] = None, padding: Optional[Array] = None,
pad_value_function: Callable[[Array], float] = np.mean, pad_value_function: Callable[[Array], float] = lambda x: 255.0, # np.mean,
) -> Array: ) -> Array:
if sum_filter is None: if sum_filter is None:
return array return array
@ -116,24 +113,74 @@ def filter_array(
return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid") return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
# Parameters of the Gaussian smoothing stages applied to the row / column
# intensity profiles. Each WIDTH/SD pair is passed to make_gaussian_kernel as
# (kernel_size, sd); even widths are bumped to the next odd tap count there.
ROW_FILTER1_WIDTH = 30
ROW_FILTER1_SD = 6
ROW_FILTER2_WIDTH = 20
ROW_FILTER2_SD = 4
COL_FILTER1_WIDTH = 90
COL_FILTER1_SD = 15
COL_FILTER2_WIDTH = 70
COL_FILTER2_SD = 12
COL_FILTER3_WIDTH = 200
COL_FILTER3_SD = 20

# Smoothing kernels keyed by axis ("row" / "col") and stage number (1..3).
# A stage set to None is a no-op: filter_array returns its input unchanged
# when it is given a None filter.
FILTERS = {
    "row": {
        1: make_gaussian_kernel(ROW_FILTER1_WIDTH, ROW_FILTER1_SD),
        2: make_gaussian_kernel(ROW_FILTER2_WIDTH, ROW_FILTER2_SD),
        3: None,
    },
    "col": {
        1: make_gaussian_kernel(COL_FILTER1_WIDTH, COL_FILTER1_SD),
        2: make_gaussian_kernel(COL_FILTER2_WIDTH, COL_FILTER2_SD),
        3: make_gaussian_kernel(COL_FILTER3_WIDTH, COL_FILTER3_SD),
    },
}
def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
    """Drop candidate lines whose peak barely rises above the following valley.

    Every candidate in ``line_list`` (an index of a local maximum of
    ``filt_sums``) is paired with a local minimum of ``filt_sums``. The
    candidate is kept only when the peak exceeds that minimum by more than the
    standard deviation of ``filt_sums``.

    Pairing: if the first minimum lies before the first candidate it is
    discarded and the last index of ``filt_sums`` is appended as a closing
    sentinel, so each candidate is compared with the minimum to its right.

    Args:
        line_list: Indices of local maxima of ``filt_sums``, in ascending order.
        filt_sums: Smoothed 1-D profile the maxima were taken from.

    Returns:
        The surviving entries of ``line_list``. Empty when there are no
        candidates or no local minimum to compare against.
    """
    if len(line_list) == 0:
        return []

    inner = filt_sums[1:-1]
    is_local_min = (inner < filt_sums[:-2]) & (inner < filt_sums[2:])
    # +1 converts positions within the trimmed slice back to filt_sums indices.
    centers = list(np.where(is_local_min)[0] + 1)

    if centers and line_list[0] > centers[0]:
        centers = centers[1:] + [len(filt_sums) - 1]

    mindiff = np.std(filt_sums)
    # NOTE(review): zip() truncates, so a trailing maximum without a later
    # minimum is dropped when no sentinel was appended - confirm this is wanted.
    return [
        maxidx
        for maxidx, minidx in zip(line_list, centers)
        if (filt_sums[maxidx] - filt_sums[minidx]) > mindiff
    ]
def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
    """Locate candidate table lines along one axis of a table image.

    The image is reduced to a 1-D profile of mean intensities, positions at or
    next to a dark profile value are boosted to a large constant, the profile
    is smoothed with the configured filter stages, and the strict local maxima
    of the result are returned.

    Args:
        table_array: 2-D image array. The darkness threshold is ``0.3 * 255``,
            so values are presumably on a 0-255 scale - confirm against caller.
        horizontal: If true, average along axis 1 and use the ``"row"``
            filters; otherwise average along axis 0 and use the ``"col"``
            filters, followed by ``filter_fp_col_lines``.

    Returns:
        List of indices into the smoothed profile.
    """
    key = "row" if horizontal else "col"
    dark_threshold = 0.3 * 255
    dark_boost = 1000.0

    sums = np.mean(table_array, axis=int(horizontal))
    predicate = dark_boost * (sums < dark_threshold)
    # A position is boosted when it or either direct neighbour is dark.
    # NOTE(review): the profile loses its first and last sample here, so the
    # returned indices may be shifted by one relative to the image - confirm.
    sums = np.maximum(
        np.maximum(sums[1:-1], predicate[1:-1]),
        np.maximum(predicate[:-2], predicate[2:]),
    )

    filtered_sums = sums
    for stage in (1, 2, 3):
        filtered_sums = filter_array(filtered_sums, FILTERS[key][stage])

    inner = filtered_sums[1:-1]
    is_local_max = (inner > filtered_sums[:-2]) & (inner > filtered_sums[2:])
    lines = list(np.where(is_local_max)[0] + 1)

    # The false-positive filter indexes lines[0], so skip it when nothing
    # was found.
    if not horizontal and lines:
        lines = filter_fp_col_lines(lines, filtered_sums)
    return lines
@ -143,7 +190,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]: def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
# cv2.GaussianBlur(img,(15,5),cv2.BORDER_DEFAULT) cv2.imwrite("/tmp/table.png", img)
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
cv2.imwrite("/tmp/table_bin.png", img)
h, w = map(int, img.shape) h, w = map(int, img.shape)
row_vals = map(int, get_lines_either(img, horizontal=True)) row_vals = map(int, get_lines_either(img, horizontal=True))
col_vals = map(int, get_lines_either(img, horizontal=False)) col_vals = map(int, get_lines_either(img, horizontal=False))
@ -152,4 +201,6 @@ def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
{"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals {"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals
] ]
save_lines(img, lines)
return {"tableLines": lines, "imageInfo": {"height": h, "width": w}} return {"tableLines": lines, "imageInfo": {"height": h, "width": w}}

View File

@ -7,11 +7,12 @@ import fitz
from kn_utils.logging import logger from kn_utils.logging import logger
def annotate_pdf(pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None): def annotate_pdf(
pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None
):
pdf_bytes = provide_byte_stream(pdf) pdf_bytes = provide_byte_stream(pdf)
with fitz.open(stream=pdf_bytes) as pdf_handle: with fitz.open(stream=pdf_bytes) as pdf_handle:
for page_annotations in annotations: for page_annotations in annotations:
# FIXME: Adapt to line drawing
index = page_annotations["pageNum"] index = page_annotations["pageNum"]
annotate_page(pdf_handle[index], page_annotations) annotate_page(pdf_handle[index], page_annotations)
output_path = output_path or "/tmp/annotated.pdf" output_path = output_path or "/tmp/annotated.pdf"
@ -20,16 +21,21 @@ def annotate_pdf(pdf: Union[str, bytes, Path], annotations, output_path: Union[s
def annotate_page(page: fitz.Page, prediction):
    """Draw predicted boxes and table lines onto a PDF page.

    Args:
        page: Page to draw on.
        prediction: Mapping with optional keys ``"boxes"`` and
            ``"tableLines"``. Each box has a ``"box"`` dict with ``x1``,
            ``y1``, ``x2``, ``y2`` plus ``"label"``, ``"probability"`` and
            ``"uuid"``; each table line is a dict with ``x1``, ``y1``,
            ``x2``, ``y2``. Missing keys are treated as empty lists.
    """
    get_corners = itemgetter("x1", "y1", "x2", "y2")
    get_meta = itemgetter("label", "probability", "uuid")
    get_start = itemgetter("x1", "y1")
    get_end = itemgetter("x2", "y2")

    boxes = prediction.get("boxes", [])
    if boxes:
        page_height = page.bound().height
        for box in boxes:
            label, probability, uuid = get_meta(box)
            x0, y0, x1, y1 = mirror_on_x_axis(get_corners(box["box"]), page_height)
            page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2)
            # Caption is placed 5 units before the box's y0 coordinate.
            page.insert_text(
                (x0, y0 - 5),
                f"{label} ({probability:.2f}), {uuid}",
                fontsize=12,
                color=(0.4, 0.4, 1),
            )

    for line in prediction.get("tableLines", []):
        page.draw_line(get_start(line), get_end(line), color=(1, 0, 0.5), width=1)

View File

@ -1,15 +1,13 @@
from dataclasses import dataclass from dataclasses import dataclass
from functools import partial from functools import partial
from operator import itemgetter from operator import itemgetter
from typing import Iterable from typing import Iterable, Tuple
from typing import Tuple
import fitz import fitz
import numpy as np import numpy as np
from funcy import compose, lfilter from funcy import compose, lfilter
from numpy import ndarray as Array
from kn_utils.logging import logger from kn_utils.logging import logger
from numpy import ndarray as Array
@dataclass @dataclass
@ -23,12 +21,17 @@ class PageInfo:
image_width: int | float image_width: int | float
image_height: int | float image_height: int | float
rotation: int rotation: int
def transform_image_coordinates_to_pdf_coordinates( def transform_image_coordinates_to_pdf_coordinates(
bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int = None bbox: Iterable[int | float],
rotation_matrix: fitz.Matrix,
transformation_matrix: fitz.Matrix,
dpi: int = None,
) -> Tuple: ) -> Tuple:
x1, y1, x2, y2 = map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox # Convert to points, can be done before x1, y1, x2, y2 = (
map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox
) # Convert to points, can be done before
rect = fitz.Rect(x1, y1, x2, y2) rect = fitz.Rect(x1, y1, x2, y2)
rect = rect * rotation_matrix * transformation_matrix rect = rect * rotation_matrix * transformation_matrix
@ -42,51 +45,44 @@ def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable
pix_h, pix_w = page_info.image_height, page_info.image_width pix_h, pix_w = page_info.image_height, page_info.image_width
ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w
round3 = lambda x: tuple(map(lambda y: round(y, 3), x)) round3 = lambda x: tuple(map(lambda y: round(y, 3), x))
ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3((ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h)) ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3(
new_bbox = round3((bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h)) (ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h)
# logger.info(f"{pdf_h=}, {pix_h=}, {pdf_w=}, {pix_w=}, {ratio_w=}, {ratio_h=}") )
# logger.info(round3(bbox)) new_bbox = round3(
# logger.info(new_bbox) (bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h)
)
return new_bbox return new_bbox
def transform_table_lines_by_page_info(
    bboxes: dict, offsets: tuple, page_info: PageInfo
) -> dict:
    """Rescale table lines to PDF units and shift them by a fixed offset.

    Each entry of ``bboxes["tableLines"]`` is rescaled with ``rescale_to_pdf``
    and then translated by ``offset_x`` horizontally and by
    ``page_info.height - offset_y`` vertically.

    Args:
        bboxes: Mapping that may contain ``"tableLines"``, a list of dicts
            with keys ``x1``, ``y1``, ``x2``, ``y2``. Modified in place.
        offsets: ``(offset_x, offset_y)`` pair; only unpacked when there is
            at least one line.
        page_info: Page geometry passed to ``rescale_to_pdf``; its ``height``
            is used for the vertical offset.

    Returns:
        The same ``bboxes`` object with ``"tableLines"`` replaced by the
        transformed lines (an empty list if there were none).
    """
    import json  # only needed for the debug dump below

    logger.debug(f"{offsets=}")

    unpack = itemgetter("x1", "y1", "x2", "y2")
    table_lines = bboxes.get("tableLines", [])
    transformed_lines = []

    if table_lines:
        # The offset is identical for every line, so compute and log it once.
        offset_x, offset_y = offsets
        offset_y = page_info.height - offset_y
        logger.debug((f"new offsets: {offset_x}, {offset_y}"))

        for line in table_lines:
            x1, y1, x2, y2 = rescale_to_pdf(unpack(line), page_info=page_info)
            transformed_lines.append(
                {
                    "x1": x1 + offset_x,
                    "y1": y1 + offset_y,
                    "x2": x2 + offset_x,
                    "y2": y2 + offset_y,
                }
            )

    bboxes["tableLines"] = transformed_lines

    for original, transformed in zip(table_lines, transformed_lines):
        logger.debug(json.dumps(original, indent=4))
        logger.debug(json.dumps(transformed, indent=4))
        logger.debug("")

    return bboxes
@ -106,11 +102,10 @@ def extract_images_from_pdf(
boxes = page_dict["boxes"] boxes = page_dict["boxes"]
boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes) boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
page = fh[page_num] # pages[int(page_num)] page = fh[page_num]
page.wrap_contents() page.wrap_contents()
# TODO: Workaround to be able to transform the image coordinates to pdf coordinates in a later step.
page_image = page.get_pixmap(dpi=200) page_image = page.get_pixmap(dpi=200)
# import IPython; IPython.embed()
current_page_info = PageInfo( current_page_info = PageInfo(
page_num, page_num,
page.rotation_matrix, page.rotation_matrix,
@ -131,12 +126,21 @@ def extract_images_from_pdf(
# current_page_info object to include the derotation_matrix. # current_page_info object to include the derotation_matrix.
rect = rect * page.transformation_matrix * page.rotation_matrix rect = rect * page.transformation_matrix * page.rotation_matrix
pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY) pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w) shape = (
(pixmap.h, pixmap.w, pixmap.n)
if pixmap.n > 1
else (pixmap.h, pixmap.w)
)
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape) image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
table_images.append(image) table_images.append(image)
table_info.append( table_info.append(
{"pageNum": page_num, "bbox": bbox, "uuid": box_obj["uuid"], "label": box_obj["label"]} {
"pageNum": page_num,
"bbox": bbox,
"uuid": box_obj["uuid"],
"label": box_obj["label"],
}
) )
page_info.append(current_page_info) page_info.append(current_page_info)