cv-analysis-service/vidocp/table_parsing.py
Matthias Bisping 00748a8ac0 Pull request #5: Table parsing version 2
Merge in RR/vidocp from table_parsing_version_2 to master

Squashed commit of the following:

commit af136ca10cf96f99699e409000ff598ce90c192e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:34:01 2022 +0100

    readme updated

commit 13ca7b1b03cb2bf7b3c8ef5821c1f8fa9ec532a0
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:32:11 2022 +0100

    drawing color standardized

commit 654e961c62ddc0f512074e8238d7fa88f0ea227e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:22:57 2022 +0100

    refactoring

commit 964c17a36f7bbc1376dfe68f4ea90462d676e215
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:07:16 2022 +0100

    readme updated

commit 4470969b35bb76e68cc41947fa02e63100b30ce9
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:05:35 2022 +0100

    readme updated

commit a6c6bdb1e71a778a3c21a628cfb30acc5bc6086f
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:05:21 2022 +0100

    readme updated

commit e178793dd69b720adefe7533312314e4c405f975
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:03:45 2022 +0100

    readme updated

commit 443163864bab56930c2ef735c0aaafddd2561ead
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 17:59:03 2022 +0100

    implemented clean solution for parsing open tables. still needs final refactoring.
2022-02-05 19:32:47 +01:00

56 lines
1.5 KiB
Python

import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils import show_cv2, draw_stats
def add_external_contours(image, img):
    """Draw the bounding box of every outermost contour of ``img`` onto ``image``.

    Args:
        image: Array the rectangles are drawn on; ``cv2.rectangle`` writes
            into it directly, so the caller's array is modified.
        img: Image searched for contours with ``cv2.RETR_EXTERNAL``, i.e. only
            the outermost contours are considered.
            NOTE(review): presumably a single-channel binary image with
            non-zero foreground -- confirm against callers.

    Returns:
        The same ``image`` object, carrying one 1-pixel-thick rectangle of
        value 255 per external contour.
    """
    # The hierarchy output is irrelevant when only outer contours are requested.
    outer_contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for left, top, width, height in map(cv2.boundingRect, outer_contours):
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, 255, 1)
    return image
def isolate_vertical_and_horizontal_components(img_bin, line_min_width=30):
    """Keep only the long horizontal and vertical strokes of a binary image.

    The image is morphologically opened twice: once with a flat
    ``1 x line_min_width`` kernel and once with a ``line_min_width x 1``
    kernel. Opening removes foreground structures that cannot contain the
    kernel, so only horizontal (respectively vertical) runs of at least
    ``line_min_width`` pixels survive. Text and other small marks are dropped.

    Args:
        img_bin: Binary image in which the strokes of interest are the
            non-zero pixels.
        line_min_width: Minimum length, in pixels, a stroke must have to be
            kept. Defaults to 30, the value that used to be hard-coded.

    Returns:
        The element-wise OR of the horizontally and the vertically opened
        images.
    """
    horizontal_kernel = np.ones((1, line_min_width), np.uint8)
    vertical_kernel = np.ones((line_min_width, 1), np.uint8)
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    return horizontal_lines | vertical_lines
def parse_table(image: np.ndarray) -> np.ndarray:
    """Locate table cells in a page image and return their component statistics.

    Pipeline: grayscale -> fixed threshold at 150 -> invert (dark strokes
    become foreground) -> keep only long horizontal/vertical lines -> close
    open tables by drawing the bounding box of each outer contour -> label the
    regions between the lines as connected components.

    Args:
        image: Page image. A 3-D array is converted with
            ``cv2.COLOR_BGR2GRAY``; a 2-D array is taken to be grayscale
            already and used as is.

    Returns:
        The ``stats`` array produced by ``cv2.connectedComponentsWithStats``
        on the inverted line image (8-connectivity, ``CV_32S`` labels): one
        row per label. Per the OpenCV documentation each row holds left, top,
        width, height and area, and label 0 is the background, which here is
        the line pixels because the line image is inverted before labelling.
    """
    # A 2-D input is already single-channel; cvtColor(BGR2GRAY) would reject it.
    if np.ndim(image) == 2:
        gray_scale = image
    else:
        gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
    # Invert so that dark strokes (table lines, text) are the non-zero pixels.
    img_bin = ~img_bin
    img_bin = isolate_vertical_and_horizontal_components(img_bin)
    # The rectangles are drawn onto the same array the contours were found in.
    img_bin_final = add_external_contours(img_bin, img_bin)
    # Invert again: the areas enclosed by lines become the labelled foreground.
    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
    return stats
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one page of a PDF, detect its table cells and display the result.

    Args:
        pdf_path: Path of the PDF file handed to ``pdf2image.convert_from_path``.
        page_index: Index of the page to process. ``page_index + 1`` is passed
            as both ``first_page`` and ``last_page``, so exactly one page is
            requested.

    Returns:
        None. The annotated page is passed to ``show_cv2``.
    """
    page_number = page_index + 1
    rendered_pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    # NOTE(review): pdf2image yields PIL images, whose arrays are presumably
    # RGB, while parse_table converts with COLOR_BGR2GRAY -- confirm the
    # channel order is as intended.
    page_pixels = np.array(rendered_pages[0])
    cell_stats = parse_table(page_pixels)
    annotated_page = draw_stats(page_pixels, cell_stats)
    show_cv2(annotated_page)