format and add functions in post_processing.py missing from merge
This commit is contained in:
parent
fc4789101f
commit
44d4eb5a98
@ -4,7 +4,7 @@ import numpy as np
|
||||
import pdf2image
|
||||
from PIL import Image
|
||||
|
||||
from vidocp.utils.deskew import deskew_histbased#, deskew_linebased
|
||||
from vidocp.utils.deskew import deskew_histbased # , deskew_linebased
|
||||
from vidocp.utils.display import show_mpl
|
||||
from vidocp.utils.draw import draw_stats
|
||||
from vidocp.table_parsing import parse_table
|
||||
@ -27,8 +27,8 @@ if __name__ == "__main__":
|
||||
page = np.array(page)
|
||||
|
||||
show_mpl(page)
|
||||
#page_ = deskew_linebased(page, verbose=True)
|
||||
#show_mpl(page_)
|
||||
# page_ = deskew_linebased(page, verbose=True)
|
||||
# show_mpl(page_)
|
||||
page_corr = deskew_histbased(page, verbose=True)
|
||||
show_mpl(page_corr)
|
||||
if args.save_path:
|
||||
@ -36,7 +36,7 @@ if __name__ == "__main__":
|
||||
page_.save(args.save_path.replace(".pdf", "_uncorrected.pdf"))
|
||||
page_corr_ = Image.fromarray(page_corr).convert("RGB")
|
||||
page_corr_.save(args.save_path.replace(".pdf", "_corrected.pdf"))
|
||||
#annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
|
||||
# annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
|
||||
stats = parse_table(page)
|
||||
page = draw_stats(page, stats)
|
||||
show_mpl(page)
|
||||
@ -48,4 +48,3 @@ if __name__ == "__main__":
|
||||
page.save(args.save_path.replace(".pdf", "_uncorrected_annotated.pdf"))
|
||||
page_corr = Image.fromarray(page_corr).convert("RGB")
|
||||
page_corr.save(args.save_path.replace(".pdf", "_corrected_annotated.pdf"))
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from functools import partial
|
||||
from itertools import chain, starmap
|
||||
from itertools import chain, compress, starmap
|
||||
from operator import attrgetter
|
||||
|
||||
import cv2
|
||||
@ -40,13 +40,13 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
|
||||
img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
|
||||
show_mpl(img_bin_h | img_bin_v)
|
||||
|
||||
#reduced filtersize from 100 to 80 to minimize splitting narrow cells
|
||||
# reduced filtersize from 100 to 80 to minimize splitting narrow cells
|
||||
img_bin_h = apply_motion_blur(img_bin_h, 80, 0)
|
||||
img_bin_v = apply_motion_blur(img_bin_v, 80, 90)
|
||||
|
||||
img_bin_final = img_bin_h | img_bin_v
|
||||
show_mpl(img_bin_final)
|
||||
#changed threshold from 110 to 120 to minimize cell splitting
|
||||
# changed threshold from 110 to 120 to minimize cell splitting
|
||||
th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
|
||||
img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
|
||||
show_mpl(img_bin_final)
|
||||
@ -121,7 +121,7 @@ def parse_table(image: np.array):
|
||||
x1, y1, w, h, area = stat
|
||||
return area > 2000 and w > 35 and h > 25
|
||||
|
||||
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape)>2 else image
|
||||
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
|
||||
th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
|
||||
img_bin = ~img_bin
|
||||
show_mpl(img_bin)
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
from collections import namedtuple
|
||||
from functools import partial
|
||||
from itertools import starmap, compress
|
||||
|
||||
|
||||
def remove_overlapping(rectangles):
|
||||
@ -19,8 +20,13 @@ def remove_included(rectangles):
|
||||
def included(a, b):
    """Return True when rectangle ``b`` lies inside rectangle ``a``.

    Both arguments must expose ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
    Coinciding edges count as inside (the comparisons are non-strict).
    """
    inside_horizontally = a.xmin <= b.xmin and b.xmax <= a.xmax
    inside_vertically = a.ymin <= b.ymin and b.ymax <= a.ymax
    return inside_horizontally and inside_vertically
|
||||
|
||||
def includes(a, b, tol=3):
    """Does ``a`` include ``b``?

    Each edge of ``b`` may stick out of ``a`` by up to ``tol`` units and
    still count as included. Both arguments must expose ``xmin``, ``ymin``,
    ``xmax`` and ``ymax``.
    """
    left_ok = a.xmin <= b.xmin + tol
    top_ok = a.ymin <= b.ymin + tol
    right_ok = b.xmax - tol <= a.xmax
    bottom_ok = b.ymax - tol <= a.ymax
    return left_ok and top_ok and right_ok and bottom_ok
|
||||
|
||||
def is_not_included(rect, rectangles):
    """Return True when no other rectangle in ``rectangles`` includes ``rect``.

    Inclusion is decided by ``includes`` (with its default tolerance), so a
    rectangle that overshoots its container by a few pixels is still treated
    as included. Entries equal to ``rect`` are skipped so that a rectangle
    never disqualifies itself.
    """
    # The older strict `included` check was left in front of this return,
    # which made the tolerant check unreachable; only the tolerant one stays.
    return not any(includes(other, rect) for other in rectangles if not rect == other)
|
||||
|
||||
rectangles = list(map(xywh_to_vec_rect, rectangles))
|
||||
rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
|
||||
@ -28,19 +34,82 @@ def remove_included(rectangles):
|
||||
return rectangles
|
||||
|
||||
|
||||
# The tolerance was previously set too low (1); most lines are 2px wide.
|
||||
def adjacent1d(n, m, tolerance=4):
    """Tell whether two 1D coordinates are at most ``tolerance`` apart."""
    distance = abs(n - m)
    return distance <= tolerance
|
||||
|
||||
|
||||
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
|
||||
|
||||
|
||||
def adjacent(a, b):
    """Decide whether rectangle ``b`` borders rectangle ``a``.

    With ``a`` spanning (xmin, ymin)-(xmax, ymax), the rectangles are adjacent
    if any of the following holds (edge matches use ``adjacent1d``):

    - a's right edge matches b's left edge and b's ymin or ymax lies within a's y range
    - a's left edge matches b's right edge and b's ymin or ymax lies within a's y range
    - a's bottom edge matches b's top edge and b's xmin or xmax lies within a's x range
    - a's top edge matches b's bottom edge and b's xmin or xmax lies within a's x range

    Returns False when either argument is None.
    """
    if a is None or b is None:
        return False

    def edges_touch(edge_a, edge_b, b_start, b_end, a_start, a_end):
        # The edges must coincide (within tolerance) and at least one end of
        # b's extent along the edge must fall inside a's extent.
        return adjacent1d(edge_a, edge_b) and any(a_start <= end <= a_end for end in (b_start, b_end))

    candidates = (
        (a.xmax, b.xmin, b.ymin, b.ymax, a.ymin, a.ymax),
        (a.xmin, b.xmax, b.ymin, b.ymax, a.ymin, a.ymax),
        (a.ymax, b.ymin, b.xmin, b.xmax, a.xmin, a.xmax),
        (a.ymin, b.ymax, b.xmin, b.xmax, a.xmin, a.xmax),
    )
    return any(edges_touch(*args) for args in candidates)
|
||||
|
||||
|
||||
# FIXME: For some reason some isolated rects remain.
|
||||
def __remove_isolated_unsorted(rectangles):
    """Drop rectangles that are not adjacent to any other rectangle.

    Every rectangle is converted with ``xywh_to_vec_rect`` and compared
    against all others (entries equal to it are skipped). The survivors are
    converted back with ``vec_rect_to_xywh`` and returned as a lazy iterator.
    """
    candidates = [xywh_to_vec_rect(r) for r in rectangles]

    def has_neighbour(rect):
        return any(adjacent(other, rect) for other in candidates if not rect == other)

    connected = filter(has_neighbour, candidates)
    return map(vec_rect_to_xywh, connected)
|
||||
|
||||
|
||||
def make_box(x1, y1, x2, y2):
    """Pack four coordinates into a dict keyed ``x1``, ``y1``, ``x2``, ``y2``."""
    return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
|
||||
|
||||
|
||||
def compute_intersection(a, b):
|
||||
def __remove_isolated_sorted(rectangles):
    """Drop rectangles that are adjacent to neither list neighbour.

    Each rectangle is only compared with its predecessor and successor in
    the input order; the first and last entries have a ``None`` neighbour on
    one side. Rectangles are converted with ``xywh_to_vec_rect`` for the
    comparison and back with ``vec_rect_to_xywh``; the result is a lazy
    iterator.
    """
    boxes = [xywh_to_vec_rect(r) for r in rectangles]
    predecessors = [None] + boxes[:-1]
    successors = boxes[1:] + [None]

    keep = (
        adjacent(before, box) or adjacent(box, after)
        for before, box, after in zip(predecessors, boxes, successors)
    )
    return map(vec_rect_to_xywh, compress(boxes, keep))
|
||||
|
||||
|
||||
def remove_isolated(rectangles, input_sorted=False):
    """Remove rectangles that have no adjacent rectangle.

    With ``input_sorted=True`` only neighbours in list order are compared
    (``__remove_isolated_sorted``); otherwise every pair is compared
    (``__remove_isolated_unsorted``).
    """
    if input_sorted:
        return __remove_isolated_sorted(rectangles)
    return __remove_isolated_unsorted(rectangles)
|
||||
|
||||
|
||||
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
|
||||
|
||||
|
||||
def compute_intersection(a, b):
    """Return the area shared by rectangles ``a`` and ``b``, or 0 if they do not overlap.

    Both arguments must expose ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
    """
    overlap_width = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)
    overlap_height = min(a.ymax, b.ymax) - max(a.ymin, b.ymin)

    # Both extents must be non-negative; otherwise two negative extents
    # would multiply into a bogus positive area.
    if overlap_width >= 0 and overlap_height >= 0:
        return overlap_width * overlap_height
    return 0
|
||||
|
||||
|
||||
@ -49,10 +118,21 @@ def has_no_parent(hierarchy):
|
||||
|
||||
|
||||
def xywh_to_vec_rect(rect):
    """Convert an ``(x, y, w, h)`` box into a ``Rectangle``.

    The box is first turned into its two corner points by ``xywh_to_vecs``
    and those are then packed into a ``Rectangle``.
    """
    return vecs_to_vec_rect(xywh_to_vecs(rect))
|
||||
|
||||
|
||||
def vecs_to_vec_rect(rect):
    """Build a ``Rectangle`` from a pair of corner points.

    ``rect`` must unpack into exactly two point sequences whose components,
    taken in order, fill the ``Rectangle`` fields.
    """
    first_corner, second_corner = rect
    return Rectangle(*first_corner, *second_corner)
|
||||
|
||||
|
||||
def xywh_to_vecs(rect):
    """Convert an ``(x, y, w, h)`` box into its two corner points.

    Args:
        rect: Sequence of four values ``x1, y1, w, h``.

    Returns:
        The pair ``((x1, y1), (x1 + w, y1 + h))``.
    """
    x1, y1, w, h = rect
    x2 = x1 + w
    y2 = y1 + h
    # A stale `return Rectangle(...)` used to precede this return; callers
    # such as xywh_to_vec_rect unpack exactly two corner points.
    return (x1, y1), (x2, y2)
|
||||
|
||||
|
||||
def vec_rect_to_xywh(rect):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user