Julius Unverfehrt ce9e92876c Pull request #16: Add table parsing fixtures
Merge in RR/cv-analysis from add_table_parsing_fixtures to master

Squashed commit of the following:

commit cfc89b421b61082c8e92e1971c9d0bf4490fa07e
Merge: a7ecb05 73c66a8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Jul 11 12:19:01 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into add_table_parsing_fixtures

commit a7ecb05b7d8327f0c7429180f63a380b61b06bc3
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Jul 11 12:02:07 2022 +0200

    refactor

commit 466f217e5a9ee5c54fd38c6acd28d54fc38ff9bb
Author: llocarnini <lillian.locarnini@iqser.com>
Date:   Mon Jul 11 10:24:14 2022 +0200

    deleted unused imports and unused lines of code

commit c58955c8658d0631cdd1c24c8556d399e3fd9990
Author: llocarnini <lillian.locarnini@iqser.com>
Date:   Mon Jul 11 10:16:01 2022 +0200

    black reformatted files

commit f8bcb10a00ff7f0da49b80c1609b17997411985a
Author: llocarnini <lillian.locarnini@iqser.com>
Date:   Tue Jul 5 15:15:00 2022 +0200

    reformat files

commit 432e8a569fd70bd0745ce0549c2bfd2f2e907763
Author: llocarnini <lillian.locarnini@iqser.com>
Date:   Tue Jul 5 15:08:22 2022 +0200

    added a better test for generic pages with tables (WIP, as thicker lines create inconsistent results).
    added a test for patchy tables, which does not work yet

commit 2aac9ebf5c76bd963f8c136fe5dd4c2d7681b469
Author: llocarnini <lillian.locarnini@iqser.com>
Date:   Mon Jul 4 16:56:29 2022 +0200

    added new fixtures for table_parsing_test.py

commit 37606cac0301b13e99be2c16d95867477f29e7c4
Author: llocarnini <lillian.locarnini@iqser.com>
Date:   Fri Jul 1 16:02:44 2022 +0200

    added a separate file for table parsing fixtures, containing fixtures for generic tables; the tests for the generic table fixtures are still WIP
2022-07-11 12:25:16 +02:00

153 lines
4.6 KiB
Python

from collections import namedtuple
from functools import partial
from itertools import starmap, compress
def remove_overlapping(rectangles):
    """Drop every rectangle that overlaps a different rectangle.

    Args:
        rectangles: iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        A lazy iterator of the ``(x, y, w, h)`` rectangles whose intersection
        area with every *other* rectangle is zero. When two rectangles
        overlap, both are dropped. Rectangles that compare equal to the
        candidate are not considered "other", so exact duplicates never
        eliminate each other.
    """
    boxes = [xywh_to_vec_rect(r) for r in rectangles]

    def is_free(box):
        # Only a strictly positive intersection area counts as an overlap;
        # rectangles that merely share an edge or a corner are fine.
        return all(
            not compute_intersection(box, other) > 0
            for other in boxes
            if other != box
        )

    free_boxes = (box for box in boxes if is_free(box))
    return map(vec_rect_to_xywh, free_boxes)
def remove_included(rectangles):
    """Drop every rectangle that lies inside a different rectangle.

    Args:
        rectangles: iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        A lazy iterator of the ``(x, y, w, h)`` rectangles that are not
        included (within a small tolerance) in any other rectangle.
        Rectangles that compare equal to the candidate are skipped, so exact
        duplicates never eliminate each other.
    """

    def includes(a, b, tol=3):
        """Does ``a`` include ``b``, allowing ``tol`` units of slack per side?"""
        return (
            b.xmin + tol >= a.xmin
            and b.ymin + tol >= a.ymin
            and b.xmax - tol <= a.xmax
            and b.ymax - tol <= a.ymax
        )

    boxes = [xywh_to_vec_rect(r) for r in rectangles]

    def is_not_included(box):
        # NOTE(review): because of the tolerance, two nearly identical (but
        # not equal) rectangles include each other, so both are dropped.
        # Confirm whether that is intended before relying on it.
        return not any(includes(other, box) for other in boxes if other != box)

    survivors = (box for box in boxes if is_not_included(box))
    return map(vec_rect_to_xywh, survivors)
# The tolerance used to be 1, which was too low: most lines are 2px wide.
def adjacent1d(n, m, tolerance=4):
    """Return True if the scalars ``n`` and ``m`` differ by at most ``tolerance``."""
    return abs(n - m) <= tolerance


# Axis-aligned rectangle stored as its (xmin, ymin) and (xmax, ymax) corners.
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")


def adjacent(a, b):
    """Return True if the rectangles ``a`` and ``b`` are adjacent.

    Two rectangles are adjacent if either of:
    - the right edge of one and the left edge of the other match (see
      ``adjacent1d``) and their y ranges overlap
    - the bottom edge of one and the top edge of the other match (see
      ``adjacent1d``) and their x ranges overlap

    The range test is a plain interval-overlap check, so the relation is
    symmetric: ``adjacent(a, b) == adjacent(b, a)``. Testing only whether the
    end points of ``b`` fall inside the range of ``a`` would miss the case
    where ``b`` spans ``a`` completely (a small cell next to a tall one).

    Args:
        a: object with ``xmin``, ``ymin``, ``xmax``, ``ymax`` attributes, or None.
        b: same as ``a``.

    Returns:
        False if either argument is None, otherwise whether the two
        rectangles are adjacent. Assumes ``xmin <= xmax`` and
        ``ymin <= ymax`` for both rectangles.
    """
    if a is None or b is None:
        return False

    def ranges_overlap(lo1, hi1, lo2, hi2):
        # Closed intervals [lo1, hi1] and [lo2, hi2] share at least one point.
        return lo1 <= hi2 and lo2 <= hi1

    x_ranges_overlap = ranges_overlap(a.xmin, a.xmax, b.xmin, b.xmax)
    y_ranges_overlap = ranges_overlap(a.ymin, a.ymax, b.ymin, b.ymax)

    return (
        # b sits to the right of a / to the left of a
        (adjacent1d(a.xmax, b.xmin) and y_ranges_overlap)
        or (adjacent1d(a.xmin, b.xmax) and y_ranges_overlap)
        # b sits below a / above a
        or (adjacent1d(a.ymax, b.ymin) and x_ranges_overlap)
        or (adjacent1d(a.ymin, b.ymax) and x_ranges_overlap)
    )
# FIXME: For some reason some isolated rects remain.
def __remove_isolated_unsorted(rectangles):
    """Keep only rectangles that are adjacent to at least one other rectangle.

    Every rectangle is compared against all the others, so the input order
    does not matter.

    Args:
        rectangles: iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        A lazy iterator of the connected ``(x, y, w, h)`` rectangles.
    """
    boxes = [xywh_to_vec_rect(r) for r in rectangles]

    def has_neighbour(box):
        # Rectangles equal to the candidate are skipped, so a rectangle
        # cannot be "connected" to itself or to an exact duplicate.
        return any(adjacent(other, box) for other in boxes if other != box)

    return map(vec_rect_to_xywh, filter(has_neighbour, boxes))
def make_box(x1, y1, x2, y2):
    """Bundle two corner points into a dict with keys ``x1``, ``y1``, ``x2``, ``y2``."""
    return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
def __remove_isolated_sorted(rectangles):
    """Keep only rectangles that are adjacent to their predecessor or successor.

    Each rectangle is compared only with its immediate neighbours in the
    input sequence, so the result depends on the input order.

    Args:
        rectangles: iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        A lazy iterator of the connected ``(x, y, w, h)`` rectangles.
    """
    boxes = [xywh_to_vec_rect(r) for r in rectangles]

    # The first rectangle has no predecessor and the last has no successor;
    # ``adjacent`` reports False for a missing (None) neighbour.
    predecessors = [None] + boxes[:-1]
    successors = boxes[1:] + [None]

    connected = (
        box
        for before, box, after in zip(predecessors, boxes, successors)
        if adjacent(before, box) or adjacent(box, after)
    )
    return map(vec_rect_to_xywh, connected)
def remove_isolated(rectangles, input_sorted=False):
    """Remove rectangles that are not adjacent to any other rectangle.

    Args:
        rectangles: iterable of ``(x, y, w, h)`` rectangles.
        input_sorted: if True, use the variant that only compares each
            rectangle with its direct neighbours in the sequence; otherwise
            compare every rectangle with all others.

    Returns:
        Whatever the selected variant returns (an iterator of
        ``(x, y, w, h)`` rectangles).
    """
    if input_sorted:
        return __remove_isolated_sorted(rectangles)
    return __remove_isolated_unsorted(rectangles)
# Axis-aligned rectangle stored as its (xmin, ymin) and (xmax, ymax) corners.
# NOTE(review): an identical ``Rectangle`` namedtuple is already defined
# earlier in this file; this rebinding looks redundant.
Rectangle = namedtuple("Rectangle", ["xmin", "ymin", "xmax", "ymax"])
def compute_intersection(a, b):
    """Return the area of the intersection of rectangles ``a`` and ``b``.

    Both arguments must expose ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
    The result is 0 when the rectangles are disjoint, and also when they
    only share an edge or a corner (zero-width or zero-height intersection).
    """
    overlap_width = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)
    overlap_height = min(a.ymax, b.ymax) - max(a.ymin, b.ymin)

    # A negative extent on either axis means there is no intersection at all;
    # without this guard two negative extents would multiply to a positive area.
    if overlap_width >= 0 and overlap_height >= 0:
        return overlap_width * overlap_height
    return 0
def has_no_parent(hierarchy):
    """Return True if the last entry of ``hierarchy`` is zero or negative.

    NOTE(review): presumably ``hierarchy`` is one contour-hierarchy row whose
    last entry is the parent index (-1 meaning "no parent") — confirm against
    the caller. Be aware that an index of 0 is also treated as "no parent".
    """
    parent_entry = hierarchy[-1]
    return parent_entry <= 0
def xywh_to_vec_rect(rect):
    """Convert an ``(x, y, w, h)`` rectangle into a corner-based ``Rectangle``.

    The result is ``Rectangle(xmin=x, ymin=y, xmax=x + w, ymax=y + h)``.
    """
    x, y, width, height = rect
    return Rectangle(x, y, x + width, y + height)
def vecs_to_vec_rect(rect):
    """Convert a pair of corner points into a ``Rectangle``.

    ``rect`` must unpack into exactly two sequences; their elements are
    passed, in order, as the ``Rectangle`` fields, e.g.
    ``((xmin, ymin), (xmax, ymax))``.
    """
    first_corner, second_corner = rect
    return Rectangle(*first_corner, *second_corner)
def xywh_to_vecs(rect):
    """Convert an ``(x, y, w, h)`` rectangle into its two corner points.

    Returns:
        ``((x, y), (x + w, y + h))``.
    """
    left, top, width, height = rect
    return (left, top), (left + width, top + height)
def vec_rect_to_xywh(rect):
    """Convert a corner-based ``(xmin, ymin, xmax, ymax)`` rectangle to ``(x, y, w, h)``.

    Returns:
        The tuple ``(xmin, ymin, xmax - xmin, ymax - ymin)``.
    """
    xmin, ymin, xmax, ymax = rect
    return xmin, ymin, xmax - xmin, ymax - ymin