From 443163864bab56930c2ef735c0aaafddd2561ead Mon Sep 17 00:00:00 2001
From: Matthias Bisping
Date: Sat, 5 Feb 2022 17:59:03 +0100
Subject: [PATCH] implemented clean solution for parsing open tables. still
 needs final refactoring.

---
 scripts/annotate.py       |   2 +-
 vidocp/table_parsing_2.py | 110 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 vidocp/table_parsing_2.py

diff --git a/scripts/annotate.py b/scripts/annotate.py
index 4c6d7b8..95de313 100644
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@@ -1,6 +1,6 @@
 import argparse
 
-from vidocp.table_parsig import annotate_tables_in_pdf
+from vidocp.table_parsing_2 import annotate_tables_in_pdf
 from vidocp.redaction_detection import annotate_boxes_in_pdf
 from vidocp.layout_detection import annotate_layout_in_pdf
 
diff --git a/vidocp/table_parsing_2.py b/vidocp/table_parsing_2.py
new file mode 100644
index 0000000..8b035bf
--- /dev/null
+++ b/vidocp/table_parsing_2.py
@@ -0,0 +1,110 @@
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+from pdf2image import pdf2image
+
+
+def add_external_contours(image, img):
+    """Draw the bounding box of every external contour of ``img`` onto ``image``.
+
+    ``image`` is modified in place and also returned; pass a copy if the
+    untouched array is still needed. Boxes are drawn with value 255 and a
+    line thickness of one pixel.
+    """
+    # Only the contours are used; the hierarchy output is discarded.
+    contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+    for contour in contours:
+        x, y, w, h = cv2.boundingRect(contour)
+        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
+
+    return image
+
+
+def isolate_vertical_and_horizontal_components(img_bin, line_min_width=30):
+    """Keep only the long horizontal and vertical strokes of a binary image.
+
+    ``img_bin`` is opened once with a ``1 x line_min_width`` kernel and once
+    with a ``line_min_width x 1`` kernel, which drops foreground structures
+    too short to contain the kernel. The two results are OR-ed together.
+    """
+    kernel_h = np.ones((1, line_min_width), np.uint8)
+    kernel_v = np.ones((line_min_width, 1), np.uint8)
+
+    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+
+    return img_bin_h | img_bin_v
+
+
+def annotate_image(image, stats):
+    """Return a copy of ``image`` with the given component boxes drawn and labelled.
+
+    ``stats`` is iterated as rows of ``(x, y, w, h, area)``. Rows whose width
+    or height is 10 px or less are skipped; every other row gets a magenta
+    rectangle plus its ``x``, ``y``, ``w`` and ``h`` values written inside it.
+    """
+    image = image.copy()
+    color = (255, 0, 255)
+
+    # The first two rows are skipped - presumably the background label and the
+    # region surrounding the table; TODO confirm.
+    for x, y, w, h, _area in stats[2:]:
+        if w <= 10 or h <= 10:
+            continue
+
+        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
+
+        # Stack the four labels upwards from the bottom-left corner, 20 px apart.
+        for i, (name, value) in enumerate(zip("xywh", (x, y, w, h))):
+            origin = (int(x + 5), int(y + h - 20 * (i + 1)))
+            cv2.putText(image, f"{name} = {value}", origin, cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
+
+    return image
+
+
+def parse_table(image: np.ndarray):
+    """Return bounding-box statistics of the regions enclosed by table lines.
+
+    ``image`` is converted with ``cv2.COLOR_BGR2GRAY``, so a three-channel BGR
+    array is expected. The result is the ``stats`` output of
+    ``cv2.connectedComponentsWithStats``: one ``(x, y, w, h, area)`` row per label.
+    """
+    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
+    # Invert so that pixels at or below the threshold (dark ink) become foreground.
+    img_bin = ~img_bin
+
+    img_bin = isolate_vertical_and_horizontal_components(img_bin)
+    # Boxing each external contour closes tables that have no outer border.
+    img_bin_final = add_external_contours(img_bin, img_bin)
+
+    # Components are taken from the inverted mask, i.e. from the areas between lines.
+    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
+
+    return stats
+
+
+def annotate_tables_in_pdf(pdf_path, page_index=1):
+    """Render one PDF page, annotate the table cells found on it and display it.
+
+    ``page_index`` is zero-based (the default of 1 selects the second page);
+    pdf2image counts pages from 1, hence the ``+ 1`` below.
+
+    Raises:
+        IndexError: if the conversion yields no page for ``page_index``.
+    """
+    page_number = page_index + 1
+    pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
+    if not pages:
+        raise IndexError(f"no page with index {page_index} in {pdf_path!r}")
+    page = np.array(pages[0])
+
+    # pdf2image returns RGB images, while parse_table converts from BGR.
+    stats = parse_table(cv2.cvtColor(page, cv2.COLOR_RGB2BGR))
+    page = annotate_image(page, stats)
+
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(page)
+    plt.show()