From 1cf8508dc36ea4ffda1db22d5081ae9b66f39eef Mon Sep 17 00:00:00 2001 From: llocarnini Date: Mon, 24 Jan 2022 16:55:29 +0100 Subject: [PATCH 01/19] changes of parameters in table parsing: l15 line_min_width = 5 so no cell is missing in tables, l37 bigger min. rectangle so no text will be detected as table --- .gitignore | 19 +++++++++++++++++++ table_parsing/table_parsig.py | 12 +++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e532dc9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +/pdfs_syngenta/2f9bd062b382f3820a43caa993d94bb5.pdf +/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf +/pdfs_syngenta/23c935f9bf704395a214ddd22af45932.pdf +/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf +/pdfs_syngenta/77cc1f61872bf7e5d65836e24449fa35.pdf +/pdfs_syngenta/1797686d2dc44e2e123877eddb5ee00b.pdf +/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf +/pdfs_syngenta/a9648c5ef39fb6a5596d9496349452b4.pdf +/pdfs_syngenta/bbf150588d2d5c213d358fe24179c71a.pdf +/pdfs_syngenta/bfd3cf846f724bd924bc3d148057f99e.pdf +/results/morph_operator_close.png +/results/morph_operator_open.png +/env/ +/home/lillian/table_parsing/.idea/ +/.idea/.gitignore +/.idea/misc.xml +/.idea/inspectionProfiles/profiles_settings.xml +/.idea/table_parsing.iml +/.idea/vcs.xml diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index c6d0306..8473d0d 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -12,12 +12,14 @@ def parse(image: np.array): th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) img_bin = ~img_bin - line_min_width = 4 + line_min_width = 5 kernel_h = np.ones((1, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 1), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + # img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_h) + # img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_v) img_bin_final = img_bin_h | img_bin_v @@ -32,7 +34,7 @@ def parse_tables_in_pdf(pages): def annotate_image(image, stats): for x, y, w, h, area in stats[2:]: - if w > 10 and h > 10: + if w > 35 and h > 13: cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): @@ -53,6 +55,10 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): page = annotate_image(page, stats) fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) + fig.set_size_inches(20 , 20) ax.imshow(page) plt.show() + + +annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4) +annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf", 11) From 6f346a6cadf41d3b15f1feb9c114f271d3253298 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 26 Jan 2022 09:42:59 +0100 Subject: [PATCH 02/19] changes so no single rectangle is parsed as table in table_parsig.py --- .gitignore | 15 ++------ table_parsing/table_parsig.py | 72 +++++++++++++++++++++++++---------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index e532dc9..e399835 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,4 @@ -/pdfs_syngenta/2f9bd062b382f3820a43caa993d94bb5.pdf -/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf -/pdfs_syngenta/23c935f9bf704395a214ddd22af45932.pdf -/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf -/pdfs_syngenta/77cc1f61872bf7e5d65836e24449fa35.pdf -/pdfs_syngenta/1797686d2dc44e2e123877eddb5ee00b.pdf -/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf -/pdfs_syngenta/a9648c5ef39fb6a5596d9496349452b4.pdf -/pdfs_syngenta/bbf150588d2d5c213d358fe24179c71a.pdf -/pdfs_syngenta/bfd3cf846f724bd924bc3d148057f99e.pdf -/results/morph_operator_close.png -/results/morph_operator_open.png +/results/before/morph_operator_open.png /env/ /home/lillian/table_parsing/.idea/ /.idea/.gitignore @@ -17,3 +6,5 @@ /.idea/inspectionProfiles/profiles_settings.xml /.idea/table_parsing.iml /.idea/vcs.xml +/pdfs/ +/results/ diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 8473d0d..7f558bd 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -7,7 +7,6 @@ from matplotlib import pyplot as plt def parse(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) img_bin = ~img_bin @@ -18,8 +17,6 @@ def parse(image: np.array): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - # img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_h) - # img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_v) img_bin_final = img_bin_h | img_bin_v @@ -28,37 +25,72 @@ def parse(image: np.array): return labels, stats -def parse_tables_in_pdf(pages): - return zip(map(parse, pages), count()) - - def annotate_image(image, stats): - for x, y, w, h, area in stats[2:]: - if w > 35 and h > 13: - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + for i in range(2, len(stats)): + x,y,w,h,area = stats[i][0],stats[i][1],stats[i][2],stats[i][3],stats[i][4] + if w > 35 and h > 13 and area > 500: + #print(stats[i]) + if y == stats[i-1][1] or y == stats[i+1][1]: + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) - for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) return image +def parse_tables_in_pdf(pages): + return zip(map(parse, pages), count()) + +# def parse(image: np.array): +# gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) +# th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) +# img_bin = ~img_bin +# +# line_min_width = 4 +# kernel_h = np.ones((1, line_min_width), np.uint8) +# kernel_v = np.ones((line_min_width, 1), np.uint8) +# +# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) +# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) +# +# img_bin_final = img_bin_h | img_bin_v +# +# _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) +# +# return labels, stats +# +# def annotate_image(image, stats): +# for x, y, w, h, area in stats[2:]: +# if w > 10 and h > 10: +# cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) +# +# for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): +# anno = f"{s} = {v}" +# xann = int(x + 5) +# yann = int(y + h - (20 * (i + 1))) +# cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) +# +# return image + def annotate_tables_in_pdf(pdf_path, page_index=1): - page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) _, stats = parse(page) page = annotate_image(page, stats) - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20 , 20) + fig.set_size_inches(20, 20) ax.imshow(page) plt.show() -annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4) -annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf", 11) +#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4) +#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf", 8) +annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Quartz sand_RAR_13_Volume_3CP_Repentol6PA_B2_2021_03_24.pdf", 6) +# annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 24) +# annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 5) +#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 16) \ No newline at end of file From ba32b3bcbed2021fe8a402fd4fa4971767167695 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 26 Jan 2022 10:47:41 +0100 Subject: [PATCH 03/19] deleted some unused line of code --- table_parsing/table_parsig.py | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 7f558bd..e90b8b9 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -44,36 +44,6 @@ def annotate_image(image, stats): def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) -# def parse(image: np.array): -# gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) -# th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) -# img_bin = ~img_bin -# -# line_min_width = 4 -# kernel_h = np.ones((1, line_min_width), np.uint8) -# kernel_v = np.ones((line_min_width, 1), np.uint8) -# -# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) -# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) -# -# img_bin_final = img_bin_h | img_bin_v -# -# _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) -# -# return labels, stats -# -# def annotate_image(image, stats): -# for x, y, w, h, area in stats[2:]: -# if w > 10 and h > 10: -# cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) -# -# for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): -# anno = f"{s} = {v}" -# xann = int(x + 5) -# yann = int(y + h - (20 * (i + 1))) -# cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) -# -# return image def annotate_tables_in_pdf(pdf_path, page_index=1): @@ -90,7 +60,7 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): #annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4) #annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf", 8) -annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Quartz sand_RAR_13_Volume_3CP_Repentol6PA_B2_2021_03_24.pdf", 6) +#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Quartz sand_RAR_13_Volume_3CP_Repentol6PA_B2_2021_03_24.pdf", 6) # annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 24) # annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 5) -#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 16) \ No newline at end of file +annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/Dokument6.pdf", 0) \ No newline at end of file From cf5851b652cc443a5552de08e04b76538ce0cc9a Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 26 Jan 2022 11:50:13 +0100 Subject: [PATCH 04/19] changes for repush --- .gitignore | 2 ++ table_parsing/table_parsig.py | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index e399835..a369ba5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ /results/before/morph_operator_open.png /env/ +/.idea/ /home/lillian/table_parsing/.idea/ /.idea/.gitignore /.idea/misc.xml @@ -8,3 +9,4 @@ /.idea/vcs.xml /pdfs/ /results/ +/.idea/modules.xml diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index e90b8b9..d00eafd 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -41,6 +41,7 @@ def annotate_image(image, stats): return image + def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) @@ -58,9 +59,3 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): plt.show() -#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4) -#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf", 8) -#annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Quartz sand_RAR_13_Volume_3CP_Repentol6PA_B2_2021_03_24.pdf", 6) -# annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 24) -# annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/pdfs_efsa/Sulphur_RAR_18_Volume_3CP_SULFUR 80_ WG_B-8_2021-04-09.pdf", 5) -annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs/Dokument6.pdf", 0) \ No newline at end of file From edf3bfe4461076a4e0e415ebbf389f0df83ee564 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 27 Jan 2022 00:19:39 +0100 Subject: [PATCH 05/19] seperate function which is filtering for isolated boxes --- table_parsing/table_parsig.py | 40 ++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index d00eafd..7b486cf 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -4,11 +4,11 @@ import cv2 import numpy as np import pdf2image from matplotlib import pyplot as plt - +from timeit import timeit def parse(image: np.array): gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) + th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) img_bin = ~img_bin line_min_width = 5 @@ -25,12 +25,39 @@ def parse(image: np.array): return labels, stats +# def filter_unconnected_cells(stats): +# filtered_cells = [] +# for i, val in enumerate(stats[2:]): +# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] +# if w > 35 and h > 13 and area > 500: +# # print(stats[i]) +# if y == stats[i - 1][1] or y == stats[i + 1][1]: +# filtered_cells.append(stats[i]) +# return filtered_cells +# +# +# +# def annotate_image(image, stats): +# stats = filter_unconnected_cells(stats) +# for i,val in enumerate(stats): +# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] +# cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) +# +# for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): +# anno = f"{s} = {v}" +# xann = int(x + 5) +# yann = int(y + h - (20 * (i + 1))) +# cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) +# +# return image + def annotate_image(image, stats): + print(stats.shape) for i in range(2, len(stats)): - x,y,w,h,area = stats[i][0],stats[i][1],stats[i][2],stats[i][3],stats[i][4] + x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] if w > 35 and h > 13 and area > 500: - #print(stats[i]) - if y == stats[i-1][1] or y == stats[i+1][1]: + # print(stats[i]) + if y == stats[i - 1][1] or y == stats[i + 1][1]: cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): @@ -48,14 +75,17 @@ def parse_tables_in_pdf(pages): def annotate_tables_in_pdf(pdf_path, page_index=1): + timeit() page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) _, stats = parse(page) page = annotate_image(page, stats) + print(timeit()) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) plt.show() + From a68f89af0320833deff2da27b9f882aaac4c8b6e Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 27 Jan 2022 00:20:16 +0100 Subject: [PATCH 06/19] added files to git ignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index a369ba5..7835a56 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,7 @@ /pdfs/ /results/ /.idea/modules.xml +/table_parsing.egg-info/dependency_links.txt +/table_parsing.egg-info/PKG-INFO +/table_parsing.egg-info/SOURCES.txt +/table_parsing.egg-info/top_level.txt From bb5083d419aa6abee7648051f6dd1017627d0fa2 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Tue, 1 Feb 2022 19:25:05 +0100 Subject: [PATCH 07/19] added function for detecting external edges --- table_parsing/table_parsig.py | 83 +++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 7b486cf..9d21c14 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -6,6 +6,7 @@ import pdf2image from matplotlib import pyplot as plt from timeit import timeit + def parse(image: np.array): gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) @@ -17,26 +18,40 @@ def parse(image: np.array): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + # print([cv2.countNonZero(row) for row in img_bin_v]) img_bin_final = img_bin_h | img_bin_v - _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + find_and_close_edges(img_bin_final) + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) return labels, stats # def filter_unconnected_cells(stats): # filtered_cells = [] -# for i, val in enumerate(stats[2:]): -# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] +# for left, middle, right in zip(stats[0:], stats[1:], list(stats[2:])+[None]): +# x, y, w, h, area = middle # if w > 35 and h > 13 and area > 500: -# # print(stats[i]) -# if y == stats[i - 1][1] or y == stats[i + 1][1]: -# filtered_cells.append(stats[i]) +# if y == left[1] or y == right[1]: +# filtered_cells.append(middle) # return filtered_cells -# -# -# + +def filter_unconnected_cells(stats): + filtered_cells = [] + # print(stats) + for left, middle, right in zip(stats[0:], stats[1:], list(stats[2:]) + [np.array([None, None, None, None, None])]): + x, y, w, h, area = middle + if w > 35 and h > 13 and area > 500: + if right[1] is None: + if y == left[1] or x == left[0]: + filtered_cells.append(middle) + else: + if y == left[1] or y == right[1] or x == left[0] or x == right[0]: + filtered_cells.append(middle) + return filtered_cells + + # def annotate_image(image, stats): # stats = filter_unconnected_cells(stats) # for i,val in enumerate(stats): @@ -50,30 +65,47 @@ def parse(image: np.array): # cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) # # return image - def annotate_image(image, stats): - print(stats.shape) - for i in range(2, len(stats)): - x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] - if w > 35 and h > 13 and area > 500: - # print(stats[i]) - if y == stats[i - 1][1] or y == stats[i + 1][1]: - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) - - for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + stats = filter_unconnected_cells(stats) + for stat in stats: + x, y, w, h, area = stat + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) return image +def find_and_close_edges(img_bin_final): + contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # contoured_img = cv2.drawContours(img_bin_final,contours, -1,(255,255,255),2) + for cnt in contours: + missing_external_edges = True + left = tuple(cnt[cnt[:, :, 0].argmin()][0]) + right = tuple(cnt[cnt[:, :, 0].argmax()][0]) + top = tuple(cnt[cnt[:, :, 1].argmin()][0]) + bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) + topleft = [left[0] + 1, top[1]] + bottomright = [right[0] - 1, bottom[1]] + + for arr in cnt: + if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): + missing_external_edges = False + break + + if missing_external_edges: + cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) + + return img_bin_final + + def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) - def annotate_tables_in_pdf(pdf_path, page_index=1): timeit() page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] @@ -86,6 +118,3 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): fig.set_size_inches(20, 20) ax.imshow(page) plt.show() - - - From 5abc80aced48e08bc5cf09b2de371b34d136c4f4 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 2 Feb 2022 11:25:10 +0100 Subject: [PATCH 08/19] corrected function for detecting external edges --- table_parsing/table_parsig.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 9d21c14..73a01bb 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -81,7 +81,6 @@ def annotate_image(image, stats): def find_and_close_edges(img_bin_final): contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # contoured_img = cv2.drawContours(img_bin_final,contours, -1,(255,255,255),2) for cnt in contours: missing_external_edges = True left = tuple(cnt[cnt[:, :, 0].argmin()][0]) @@ -89,15 +88,18 @@ def find_and_close_edges(img_bin_final): top = tuple(cnt[cnt[:, :, 1].argmin()][0]) bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) topleft = [left[0] + 1, top[1]] + # print(cnt, left, top, topleft) bottomright = [right[0] - 1, bottom[1]] - for arr in cnt: if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): missing_external_edges = False break - if missing_external_edges: + if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1])>= 50000: + topleft[0] -= 1 + bottomright[0] += 1 cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) + print("missing cell detectet rectangle drawn") return img_bin_final From cf29204a9e50200064ec235016aa97f7ce4847a1 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 2 Feb 2022 16:35:35 +0100 Subject: [PATCH 09/19] corrected function for detecting external edges --- table_parsing/table_parsig.py | 47 +++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 73a01bb..1efe198 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -18,7 +18,7 @@ def parse(image: np.array): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - # print([cv2.countNonZero(row) for row in img_bin_v]) + #print(np.nonzero(img_bin_v)) img_bin_final = img_bin_h | img_bin_v @@ -79,43 +79,70 @@ def annotate_image(image, stats): return image +# def find_and_close_edges(img_bin_final): +# contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) +# +# for cnt in contours: +# missing_external_edges = True +# left = tuple(cnt[cnt[:, :, 0].argmin()][0]) +# right = tuple(cnt[cnt[:, :, 0].argmax()][0]) +# top = tuple(cnt[cnt[:, :, 1].argmin()][0]) +# bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) +# topleft = [left[0] + 1, top[1]] +# # print(cnt, left, top, topleft) +# bottomright = [right[0] - 1, bottom[1]] +# for arr in cnt: +# if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): +# missing_external_edges = False +# break +# +# if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1]) >= 50000: +# topleft[0] -= 1 +# bottomright[0] += 1 +# cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) +# #print("missing cell detectet rectangle drawn") +# +# return img_bin_final + def find_and_close_edges(img_bin_final): - contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for cnt in contours: missing_external_edges = True left = tuple(cnt[cnt[:, :, 0].argmin()][0]) right = tuple(cnt[cnt[:, :, 0].argmax()][0]) top = tuple(cnt[cnt[:, :, 1].argmin()][0]) bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) - topleft = [left[0] + 1, top[1]] + topleft = [left[0], top[1]] # print(cnt, left, top, topleft) - bottomright = [right[0] - 1, bottom[1]] + bottomright = [right[0], bottom[1]] for arr in cnt: if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): missing_external_edges = False break - if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1])>= 50000: - topleft[0] -= 1 - bottomright[0] += 1 + if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1]) >= 50000: cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) - print("missing cell detectet rectangle drawn") + #print("missing cell detectet rectangle drawn") return img_bin_final +def find_and_close_internal_gaps(img_bin_final): + contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) def annotate_tables_in_pdf(pdf_path, page_index=1): - timeit() + #timeit() page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) _, stats = parse(page) page = annotate_image(page, stats) - print(timeit()) + #print(timeit()) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) From 9a065a0e7f62823a3b18e301d12c80b1a74f0b3e Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 3 Feb 2022 16:45:09 +0100 Subject: [PATCH 10/19] Made Bob proud --- table_parsing/table_parsig.py | 43 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index c6d0306..5090721 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -4,42 +4,41 @@ import cv2 import numpy as np import pdf2image from matplotlib import pyplot as plt +import imutils def parse(image: np.array): gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) - img_bin = ~img_bin + blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) + thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] + img_bin = ~thresh - line_min_width = 4 - kernel_h = np.ones((1, line_min_width), np.uint8) - kernel_v = np.ones((line_min_width, 1), np.uint8) + line_min_width = 10 + kernel_h = np.ones((10, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 10), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) img_bin_final = img_bin_h | img_bin_v - - _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) - - return labels, stats + + contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = imutils.grab_contours(contours) + for c in contours: + peri = cv2.arcLength(c, True) + approx = cv2.approxPolyDP(c, 0.04 * peri, True) + yield cv2.boundingRect(approx) def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) -def annotate_image(image, stats): - for x, y, w, h, area in stats[2:]: - if w > 10 and h > 10: - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) - - for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) +def annotate_boxes(image, rects): + for rect in rects: + (x, y, w, h) = rect + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) return image @@ -49,10 +48,10 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) - _, stats = parse(page) - page = annotate_image(page, stats) + asd = parse(page) + page = annotate_boxes(page, asd) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) - plt.show() + plt.show() \ No newline at end of file From aed7f27626d7ae0a42b0542f9dcfe25bf1f2b9f8 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 3 Feb 2022 17:01:57 +0100 Subject: [PATCH 11/19] new changes for table completion --- table_parsing/table_parsig.py | 76 +++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 1efe198..dce5f9f 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -18,15 +18,38 @@ def parse(image: np.array): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - #print(np.nonzero(img_bin_v)) - + # find_and_close_internal_gaps(img_bin_v) img_bin_final = img_bin_h | img_bin_v - - find_and_close_edges(img_bin_final) + plt.imshow(img_bin_final) + #find_and_close_internal_gaps(img_bin_final) + #find_and_close_edges(img_bin_final) _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) return labels, stats +# def parse(image: np.array): +# gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) +# th1, img_bin = cv2.threshold(gray_scale, 250, 255, cv2.THRESH_BINARY) +# img_bin = ~img_bin +# +# line_min_width = 10 +# kernel_h = np.ones((20, line_min_width), np.uint8) +# #kernel_v = np.ones((line_min_width, 20), np.uint8) +# +# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) +# #img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) +# #img_bin_final = img_bin_h | img_bin_v +# contours, hierarchy = cv2.findContours(img_bin_h, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) +# cv2.drawContours(img_bin_h, contours, 1, (255,0,0) , 6) +# plt.imshow(img_bin_h) +# print([cnt for cnt in contours if len(cnt)==4]) +# #plt.imshow(img_bin_h) +# #find_and_close_internal_gaps(img_bin_final) +# #find_and_close_edges(img_bin_final) +# +# #_, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) +# #return labels, stats +# return contours,hierarchy # def filter_unconnected_cells(stats): # filtered_cells = [] @@ -52,19 +75,7 @@ def filter_unconnected_cells(stats): return filtered_cells -# def annotate_image(image, stats): -# stats = filter_unconnected_cells(stats) -# for i,val in enumerate(stats): -# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] -# cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) -# -# for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): -# anno = f"{s} = {v}" -# xann = int(x + 5) -# yann = int(y + h - (20 * (i + 1))) -# cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) -# -# return image + def annotate_image(image, stats): stats = filter_unconnected_cells(stats) for stat in stats: @@ -114,21 +125,36 @@ def find_and_close_edges(img_bin_final): top = tuple(cnt[cnt[:, :, 1].argmin()][0]) bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) topleft = [left[0], top[1]] - # print(cnt, left, top, topleft) bottomright = [right[0], bottom[1]] for arr in cnt: if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): missing_external_edges = False break - if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1]) >= 50000: - cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) - #print("missing cell detectet rectangle drawn") + if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000: + cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2) + # print("missing cell detectet rectangle drawn") return img_bin_final -def find_and_close_internal_gaps(img_bin_final): - contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + +def find_and_close_internal_gaps(img_bin): + contours, hierarchy = cv2.findContours(img_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(img_bin, contours, -1,(255,255,255),2) + plt.imshow(img_bin) + #print([cnt for cnt in contours if len(cnt) == 2]) + # + # print(contours) + # contours_list = sorted([cnt.tolist() for cnt in contours if len(cnt)>2]) + # lines_with_gaps = [] + # for left, right in zip(contours_list[0:], contours_list[1:] + [[[[None]]]]): + # print(left, left[0], left[0][0]) + # if left[1][0][1]-left[0][0][1] > 13: + # if left[0][0][0] == right[0][0][0]: + # lines_with_gaps.append(left + right) + # for lines in lines_with_gaps: + # cv2.line(img_bin, tuple(min(lines)[0]), tuple(max(lines)[0]), (255,255,255), 2) + # #plt.imshow(img_bin) def parse_tables_in_pdf(pages): @@ -136,13 +162,13 @@ def parse_tables_in_pdf(pages): def annotate_tables_in_pdf(pdf_path, page_index=1): - #timeit() + # timeit() page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) _, stats = parse(page) page = annotate_image(page, stats) - #print(timeit()) + # print(timeit()) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) From 6274c204a9c12ac38e39772e4e2a47681590d224 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 4 Feb 2022 10:11:14 +0100 Subject: [PATCH 12/19] added imutils to requirements few changes in table_parsig.py bc of a pull --- requirements.txt | 1 + table_parsing/table_parsig.py | 89 +++++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index c607322..2be77e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ opencv-python~=4.5.5.62 numpy~=1.22.1 pdf2image~=1.16.0 matplotlib~=3.5.1 +imutils~=0.5.4 \ No newline at end of file diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index ebae9c8..16db709 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -1,14 +1,13 @@ from itertools import count import cv2 +import imutils import numpy as np import pdf2image from matplotlib import pyplot as plt -import imutils def parse(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] @@ -30,10 +29,93 @@ def parse(image: np.array): approx = cv2.approxPolyDP(c, 0.04 * peri, True) yield cv2.boundingRect(approx) +def parse_tables(image: np.array, rectangle): + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) + img_bin = ~img_bin + + line_min_width = 5 + kernel_h = np.ones((1, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 1), np.uint8) + + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + # find_and_close_internal_gaps(img_bin_v) + img_bin_final = img_bin_h | img_bin_v + plt.imshow(img_bin_final) + # find_and_close_internal_gaps(img_bin_final) + # find_and_close_edges(img_bin_final) + + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + return labels, stats + + +def filter_unconnected_cells(stats): + filtered_cells = [] + # print(stats) + for left, middle, right in zip(stats[0:], stats[1:], + list(stats[2:]) + [np.array([None, None, None, None, None])]): + x, y, w, h, area = middle + if w > 35 and h > 13 and area > 500: + if right[1] is None: + if y == left[1] or x == left[0]: + filtered_cells.append(middle) + else: + if y == left[1] or y == right[1] or x == left[0] or x == right[0]: + filtered_cells.append(middle) + return filtered_cells + +def find_and_close_edges(img_bin_final): + contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + for cnt in contours: + missing_external_edges = True + left = tuple(cnt[cnt[:, :, 0].argmin()][0]) + right = tuple(cnt[cnt[:, :, 0].argmax()][0]) + top = tuple(cnt[cnt[:, :, 1].argmin()][0]) + bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) + topleft = [left[0], top[1]] + bottomright = [right[0], bottom[1]] + for arr in cnt: + if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): + missing_external_edges = False + break + + if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000: + cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2) + # print("missing cell detectet rectangle drawn") + + return img_bin_final + +def annotate_image(image, stats): + stats = filter_unconnected_cells(stats) + for stat in stats: + x, y, w, h, area = stat + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + + return image def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) +# def annotate_tables_in_pdf(pdf_path, page_index=1): +# # timeit() +# page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] +# page = np.array(page) +# +# _, stats = parse(page) +# page = annotate_image(page, stats) +# # print(timeit()) +# fig, ax = plt.subplots(1, 1) +# fig.set_size_inches(20, 20) +# ax.imshow(page) +# plt.show() + def annotate_boxes(image, rects): for rect in rects: @@ -44,7 +126,6 @@ def annotate_boxes(image, rects): def annotate_tables_in_pdf(pdf_path, page_index=1): - page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) @@ -54,4 +135,4 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) - plt.show() \ No newline at end of file + plt.show() From 443163864bab56930c2ef735c0aaafddd2561ead Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sat, 5 Feb 2022 17:59:03 +0100 Subject: [PATCH 13/19] implememted clean solution for parsing open tables. still needs final refactoring. --- scripts/annotate.py | 2 +- vidocp/table_parsing_2.py | 74 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 vidocp/table_parsing_2.py diff --git a/scripts/annotate.py b/scripts/annotate.py index 4c6d7b8..95de313 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -1,6 +1,6 @@ import argparse -from vidocp.table_parsig import annotate_tables_in_pdf +from vidocp.table_parsing_2 import annotate_tables_in_pdf from vidocp.redaction_detection import annotate_boxes_in_pdf from vidocp.layout_detection import annotate_layout_in_pdf diff --git a/vidocp/table_parsing_2.py b/vidocp/table_parsing_2.py new file mode 100644 index 0000000..8b035bf --- /dev/null +++ b/vidocp/table_parsing_2.py @@ -0,0 +1,74 @@ +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from pdf2image import pdf2image + + +def add_external_contours(image, img): + + contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + for cnt in contours: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) + + return image + + +def isolate_vertical_and_horizontal_components(img_bin): + + line_min_width = 30 + kernel_h = np.ones((1, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 1), np.uint8) + + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + + img_bin_final = img_bin_h | img_bin_v + + return img_bin_final + + +def annotate_image(image, stats): + + image = image.copy() + + for x, y, w, h, area in stats[2:]: + if w > 10 and h > 10: + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + + for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + + return image + + +def parse_table(image: np.array): + + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) + img_bin = ~img_bin + + img_bin = isolate_vertical_and_horizontal_components(img_bin) + img_bin_final = add_external_contours(img_bin, img_bin) + + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + + return stats + + +def annotate_tables_in_pdf(pdf_path, page_index=1): + + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + stats = parse_table(page) + page = annotate_image(page, stats) + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(page) + plt.show() From b569b035721fded90ba754c914891e42502e0eae Mon Sep 17 00:00:00 2001 From: llocarnini Date: Sat, 5 Feb 2022 18:00:10 +0100 Subject: [PATCH 14/19] current status of trying to connect layout and table parsing --- table_parsing/table_parsig.py | 94 +++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 16db709..00a3d36 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -9,19 +9,22 @@ from matplotlib import pyplot as plt def parse(image: np.array): gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) - thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] + #plt.imshow(gray_scale) + blurred = cv2.GaussianBlur(gray_scale, (7, 7), 2) #5 5 1 + thresh = cv2.threshold(blurred, 251, 255, cv2.THRESH_BINARY)[1] + #plt.imshow(thresh) img_bin = ~thresh - line_min_width = 10 + line_min_width = 7 kernel_h = np.ones((10, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 10), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - + #plt.imshow(img_bin_h) + #plt.imshow(img_bin_v) img_bin_final = img_bin_h | img_bin_v - + plt.imshow(img_bin_final) contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = imutils.grab_contours(contours) for c in contours: @@ -29,25 +32,48 @@ def parse(image: np.array): approx = cv2.approxPolyDP(c, 0.04 * peri, True) yield cv2.boundingRect(approx) -def parse_tables(image: np.array, rectangle): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) - img_bin = ~img_bin +def parse_tables(image: np.array, rects: list): + parsed_tables = [] + for rect in rects: + (x,y,w,h) = rect + region_of_interest = image[x:x+w, y:y+h] + gray = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY) + thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)[1] + img_bin = ~thresh - line_min_width = 5 - kernel_h = np.ones((1, line_min_width), np.uint8) - kernel_v = np.ones((line_min_width, 1), np.uint8) + line_min_width = 5 + kernel_h = np.ones((1, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 1), np.uint8) - img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) - img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) # find_and_close_internal_gaps(img_bin_v) - img_bin_final = img_bin_h | img_bin_v - plt.imshow(img_bin_final) + img_bin_final = img_bin_h | img_bin_v + #plt.imshow(img_bin_final) # find_and_close_internal_gaps(img_bin_final) # find_and_close_edges(img_bin_final) - _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) - return labels, stats + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + parsed_tables.append([(x,y,w,h), stats]) + return parsed_tables + #yield (x,y,w,h), stats, region_of_interest + # return stats + +def annotate_table(image, parsed_tables): + for table in parsed_tables: + original_coordinates, stats = table + stats = filter_unconnected_cells(stats) + for stat in stats: + x, y, w, h, area = stat + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + + return image + def filter_unconnected_cells(stats): @@ -87,18 +113,7 @@ def find_and_close_edges(img_bin_final): return img_bin_final -def annotate_image(image, stats): - stats = filter_unconnected_cells(stats) - for stat in stats: - x, y, w, h, area = stat - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) - for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) - return image def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) @@ -118,19 +133,36 @@ def parse_tables_in_pdf(pages): def annotate_boxes(image, rects): + print(type(rects)) for rect in rects: (x, y, w, h) = rect cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) return image +def filter_tables_or_images(rects): + filtered = [] + for rect in rects: + (x,y,w,h) = rect + print(w*h) + if w * h > 10**6: + filtered.append(rect) + print(filtered) + return filtered + + + def annotate_tables_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) - asd = parse(page) - page = annotate_boxes(page, asd) + layout_boxes = parse(page) + page = annotate_boxes(page, layout_boxes) + parsed_tables = parse_tables(page, filter_tables_or_images(layout_boxes)) + page = annotate_table(page, parsed_tables) + + fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) From ee613f3e78aa7c62f8dbc80d9050520992b3a87e Mon Sep 17 00:00:00 2001 From: llocarnini Date: Tue, 8 Feb 2022 15:35:18 +0100 Subject: [PATCH 15/19] add to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7835a56..894f468 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ /table_parsing.egg-info/PKG-INFO /table_parsing.egg-info/SOURCES.txt /table_parsing.egg-info/top_level.txt +/data/ From b19a9e35c858e0a04342fb3dace72582b8ac6c7f Mon Sep 17 00:00:00 2001 From: llocarnini Date: Mon, 21 Feb 2022 14:58:58 +0100 Subject: [PATCH 16/19] added function to rotate pdfs which were scanned at an angle. Not working yet --- vidocp/table_parsing.py | 82 ++++++++++++++++++++++++++++++--- vidocp/utils/post_processing.py | 4 +- 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index c43a457..e05d78e 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -5,6 +5,7 @@ from operator import attrgetter import cv2 import numpy as np from pdf2image import pdf2image +from scipy import ndimage from vidocp.utils.display import show_mpl from vidocp.utils.draw import draw_rectangles @@ -23,6 +24,69 @@ def add_external_contours(image, img): return image +# def rotate_line_components(img_bin_h, img_bin_v): +# def get_avg_angle(img_bin): +# edges = cv2.Canny(img_bin, 50, 150, apertureSize=3) +# lines = cv2.HoughLines(edges, 1, np.pi / 180, 200) +# angles_deg = [] +# +# for line in lines: +# line = line[0] +# rho, theta = line +# angles_deg.append(180/np.pi * theta) +# +# avg_ang_deg = sum(angles_deg) / len(angles_deg) +# return average_angle_in_deg +# avg_ang_h = get_avg_angle(img_bin_h) +# avg_ang_v = get_avg_angle(img_bin_v) +# print(average_angle_in_deg, "angle before angle correction") +# # average_angle_in_deg = 90 - average_angle_in_deg +# # print(average_angle_in_deg) +# if average_angle_in_deg == 0.0 or average_angle_in_deg == 90.0: +# print("angle is 0 or 90") +# return img_bin +# elif average_angle_in_deg > 0: #linksrtoation im originalbild +# print("leftrot angle is not 0 or 90") +# average_angle_in_deg = 360 - average_angle_in_deg +# else: +# print("rightrot", average_angle_in_deg) +# print(average_angle_in_deg, "angle for Rotationsmatrix") +# img_bin_rotated = ndimage.rotate(img_bin, average_angle_in_deg, reshape=False) +# show_mpl(img_bin_rotated) +# +# return img_bin + +# def rotate_line_components(img_bin): +# height, width = img_bin.shape[:2] +# center = (width/2, height/2) +# edges = cv2.Canny(img_bin, 50, 150, apertureSize=3) +# lines = cv2.HoughLines(edges, 1, np.pi / 180, 200) +# angles_deg = [] +# +# for line in lines: +# line = line[0] +# rho, theta = line +# angles_deg.append(180/np.pi * theta) +# #linksdrehung < 90 rechtsdrehung > 90 +# average_angle_in_deg = sum(angles_deg) / len(angles_deg) +# print(average_angle_in_deg, "angle before angle correction") +# average_angle_in_deg = 90 - average_angle_in_deg +# print(average_angle_in_deg) +# if average_angle_in_deg == 0.0 or average_angle_in_deg == 90.0: +# print("angle is 0 or 90") +# return img_bin +# elif average_angle_in_deg > 0: #linksrtoation im originalbild +# print("leftrot angle is not 0 or 90") +# average_angle_in_deg = 360 - average_angle_in_deg +# else: +# print("rightrot", average_angle_in_deg) +# print(average_angle_in_deg, "angle for Rotationsmatrix") +# img_bin_rotated = ndimage.rotate(img_bin, average_angle_in_deg, reshape=False) +# show_mpl(img_bin_rotated) +# +# return img_bin + + def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): line_min_width = 48 @@ -33,25 +97,31 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) show_mpl(img_bin_h | img_bin_v) + # img_bin_h = rotate_line_components(img_bin_h) + # img_bin_v = rotate_line_components(img_bin_v) + #show_mpl(img_bin_h | img_bin_v) + kernel_h = np.ones((1, 30), np.uint8) kernel_v = np.ones((30, 1), np.uint8) img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2) img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2) - show_mpl(img_bin_h | img_bin_v) + #show_mpl(img_bin_h | img_bin_v) #reduced filtersize from 100 to 80 to minimize splitting narrow cells - img_bin_h = apply_motion_blur(img_bin_h, 80, 0) - img_bin_v = apply_motion_blur(img_bin_v, 80, 90) + img_bin_h = apply_motion_blur(img_bin_h, 90, 0) + img_bin_v = apply_motion_blur(img_bin_v, 90, 90) img_bin_final = img_bin_h | img_bin_v - show_mpl(img_bin_final) + #show_mpl(img_bin_final) + #changed threshold from 110 to 120 to minimize cell splitting th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY) img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1) - show_mpl(img_bin_final) + #show_mpl(img_bin_final) + # problem if layout parser detects too big of a layout box as in VV-748542.pdf p.22 img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects) - show_mpl(img_bin_final) + #show_mpl(img_bin_final) return img_bin_final diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index a3a04b1..bf10793 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -43,7 +43,7 @@ def adjacent(a, b): """ def adjacent2d(g, h, i, j, k, l): - #print(adjacent1d(g, h) and any(k <= p <= l for p in [i, j])) + #print(adjacent1d(g, h), any(k <= p <= l for p in [i, j])) return adjacent1d(g, h) and any(k <= p <= l for p in [i, j]) if any(x is None for x in (a, b)): @@ -79,7 +79,7 @@ def __remove_isolated_unsorted(rectangles): def __remove_isolated_sorted(rectangles): def is_connected(left, center, right): - # print(left,center,right, list(starmap(adjacent, [(left, center), (center, right)]))) + #print(left,center,right) return any(starmap(adjacent, [(left, center), (center, right)])) rectangles = list(map(xywh_to_vec_rect, rectangles)) From 5b4636717597932745e5d5f1e10e24861e5cfdd4 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 25 Feb 2022 08:13:47 +0100 Subject: [PATCH 17/19] no new changes --- vidocp/table_parsing.py | 93 +++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index e05d78e..8d71baf 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -24,6 +24,7 @@ def add_external_contours(image, img): return image + # def rotate_line_components(img_bin_h, img_bin_v): # def get_avg_angle(img_bin): # edges = cv2.Canny(img_bin, 50, 150, apertureSize=3) @@ -33,59 +34,43 @@ def add_external_contours(image, img): # for line in lines: # line = line[0] # rho, theta = line -# angles_deg.append(180/np.pi * theta) +# angles_deg.append(180 / np.pi * theta) # # avg_ang_deg = sum(angles_deg) / len(angles_deg) -# return average_angle_in_deg +# return avg_ang_deg +# # avg_ang_h = get_avg_angle(img_bin_h) # avg_ang_v = get_avg_angle(img_bin_v) -# print(average_angle_in_deg, "angle before angle correction") -# # average_angle_in_deg = 90 - average_angle_in_deg -# # print(average_angle_in_deg) -# if average_angle_in_deg == 0.0 or average_angle_in_deg == 90.0: -# print("angle is 0 or 90") -# return img_bin -# elif average_angle_in_deg > 0: #linksrtoation im originalbild -# print("leftrot angle is not 0 or 90") -# average_angle_in_deg = 360 - average_angle_in_deg -# else: -# print("rightrot", average_angle_in_deg) -# print(average_angle_in_deg, "angle for Rotationsmatrix") -# img_bin_rotated = ndimage.rotate(img_bin, average_angle_in_deg, reshape=False) -# show_mpl(img_bin_rotated) +# print(avg_ang_h,avg_ang_v) # -# return img_bin - -# def rotate_line_components(img_bin): -# height, width = img_bin.shape[:2] -# center = (width/2, height/2) -# edges = cv2.Canny(img_bin, 50, 150, apertureSize=3) -# lines = cv2.HoughLines(edges, 1, np.pi / 180, 200) -# angles_deg = [] +# if avg_ang_h > avg_ang_v: # leftrotated scan +# rot_ang_h = 270 + avg_ang_h +# rot_ang_v = avg_ang_v +# elif avg_ang_h < avg_ang_v: # rightrotated scan +# rot_ang_h = 90 - avg_ang_h +# rot_ang_v = avg_ang_v +# print(rot_ang_h, rot_ang_v) +# img_bin_hr = ndimage.rotate(img_bin_h, rot_ang_h, reshape=False) +# img_bin_vr = ndimage.rotate(img_bin_v, rot_ang_v, reshape=False) +# #show_mpl(img_bin_hr) # -# for line in lines: -# line = line[0] -# rho, theta = line -# angles_deg.append(180/np.pi * theta) -# #linksdrehung < 90 rechtsdrehung > 90 -# average_angle_in_deg = sum(angles_deg) / len(angles_deg) -# print(average_angle_in_deg, "angle before angle correction") -# average_angle_in_deg = 90 - average_angle_in_deg -# print(average_angle_in_deg) -# if average_angle_in_deg == 0.0 or average_angle_in_deg == 90.0: -# print("angle is 0 or 90") -# return img_bin -# elif average_angle_in_deg > 0: #linksrtoation im originalbild -# print("leftrot angle is not 0 or 90") -# average_angle_in_deg = 360 - average_angle_in_deg -# else: -# print("rightrot", average_angle_in_deg) -# print(average_angle_in_deg, "angle for Rotationsmatrix") -# img_bin_rotated = ndimage.rotate(img_bin, average_angle_in_deg, reshape=False) -# show_mpl(img_bin_rotated) # -# return img_bin - +# # print(average_angle_in_deg, "angle before angle correction") +# # # average_angle_in_deg = 90 - average_angle_in_deg +# # # print(average_angle_in_deg) +# # if average_angle_in_deg == 0.0 or average_angle_in_deg == 90.0: +# # print("angle is 0 or 90") +# # return img_bin +# # elif average_angle_in_deg > 0: #linksrtoation im originalbild +# # print("leftrot angle is not 0 or 90") +# # average_angle_in_deg = 360 - average_angle_in_deg +# # else: +# # print("rightrot", average_angle_in_deg) +# # print(average_angle_in_deg, "angle for Rotationsmatrix") +# # img_bin_rotated = ndimage.rotate(img_bin, average_angle_in_deg, reshape=False) +# # show_mpl(img_bin_rotated) +# +# return img_bin_hr, img_bin_vr def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): @@ -97,31 +82,29 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) show_mpl(img_bin_h | img_bin_v) - # img_bin_h = rotate_line_components(img_bin_h) - # img_bin_v = rotate_line_components(img_bin_v) - #show_mpl(img_bin_h | img_bin_v) + # img_bin_h, img_bin_v = rotate_line_components(img_bin_h,img_bin_v) kernel_h = np.ones((1, 30), np.uint8) kernel_v = np.ones((30, 1), np.uint8) img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2) img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2) - #show_mpl(img_bin_h | img_bin_v) + # show_mpl(img_bin_h | img_bin_v) - #reduced filtersize from 100 to 80 to minimize splitting narrow cells + # reduced filtersize from 100 to 80 to minimize splitting narrow cells img_bin_h = apply_motion_blur(img_bin_h, 90, 0) img_bin_v = apply_motion_blur(img_bin_v, 90, 90) img_bin_final = img_bin_h | img_bin_v - #show_mpl(img_bin_final) + # show_mpl(img_bin_final) - #changed threshold from 110 to 120 to minimize cell splitting + # changed threshold from 110 to 120 to minimize cell splitting th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY) img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1) - #show_mpl(img_bin_final) + # show_mpl(img_bin_final) # problem if layout parser detects too big of a layout box as in VV-748542.pdf p.22 img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects) - #show_mpl(img_bin_final) + # show_mpl(img_bin_final) return img_bin_final From dcab29becbb28a8e78fcd8dd39735d5f99b130e3 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 25 Feb 2022 08:59:19 +0100 Subject: [PATCH 18/19] deleted not needed function --- vidocp/table_parsing.py | 50 ----------------------------------------- 1 file changed, 50 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 8d71baf..155cc15 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -5,7 +5,6 @@ from operator import attrgetter import cv2 import numpy as np from pdf2image import pdf2image -from scipy import ndimage from vidocp.utils.display import show_mpl from vidocp.utils.draw import draw_rectangles @@ -25,53 +24,6 @@ def add_external_contours(image, img): return image -# def rotate_line_components(img_bin_h, img_bin_v): -# def get_avg_angle(img_bin): -# edges = cv2.Canny(img_bin, 50, 150, apertureSize=3) -# lines = cv2.HoughLines(edges, 1, np.pi / 180, 200) -# angles_deg = [] -# -# for line in lines: -# line = line[0] -# rho, theta = line -# angles_deg.append(180 / np.pi * theta) -# -# avg_ang_deg = sum(angles_deg) / len(angles_deg) -# return avg_ang_deg -# -# avg_ang_h = get_avg_angle(img_bin_h) -# avg_ang_v = get_avg_angle(img_bin_v) -# print(avg_ang_h,avg_ang_v) -# -# if avg_ang_h > avg_ang_v: # leftrotated scan -# rot_ang_h = 270 + avg_ang_h -# rot_ang_v = avg_ang_v -# elif avg_ang_h < avg_ang_v: # rightrotated scan -# rot_ang_h = 90 - avg_ang_h -# rot_ang_v = avg_ang_v -# print(rot_ang_h, rot_ang_v) -# img_bin_hr = ndimage.rotate(img_bin_h, rot_ang_h, reshape=False) -# img_bin_vr = ndimage.rotate(img_bin_v, rot_ang_v, reshape=False) -# #show_mpl(img_bin_hr) -# -# -# # print(average_angle_in_deg, "angle before angle correction") -# # # average_angle_in_deg = 90 - average_angle_in_deg -# # # print(average_angle_in_deg) -# # if average_angle_in_deg == 0.0 or average_angle_in_deg == 90.0: -# # print("angle is 0 or 90") -# # return img_bin -# # elif average_angle_in_deg > 0: #linksrtoation im originalbild -# # print("leftrot angle is not 0 or 90") -# # average_angle_in_deg = 360 - average_angle_in_deg -# # else: -# # print("rightrot", average_angle_in_deg) -# # print(average_angle_in_deg, "angle for Rotationsmatrix") -# # img_bin_rotated = ndimage.rotate(img_bin, average_angle_in_deg, reshape=False) -# # show_mpl(img_bin_rotated) -# -# return img_bin_hr, img_bin_vr - def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): line_min_width = 48 @@ -82,8 +34,6 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) show_mpl(img_bin_h | img_bin_v) - # img_bin_h, img_bin_v = rotate_line_components(img_bin_h,img_bin_v) - kernel_h = np.ones((1, 30), np.uint8) kernel_v = np.ones((30, 1), np.uint8) img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2) From 496957051cd8515b4aff7ff3e754bbcb0f240483 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Mon, 28 Feb 2022 16:12:30 +0100 Subject: [PATCH 19/19] added two tests for table_parsing.py -testing number of parsed rectangles -testing range of table coordinates (where to find a table) --- .gitignore | 7 ++++++- tests/test_table_parsing.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 tests/test_table_parsing.py diff --git a/.gitignore b/.gitignore index bfbacf3..393863e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,9 @@ /.idea/vcs.xml /results/ /data -/table_parsing.egg-info \ No newline at end of file +/table_parsing.egg-info +/tests/VV-313450.pdf +/vidocp.egg-info/dependency_links.txt +/vidocp.egg-info/PKG-INFO +/vidocp.egg-info/SOURCES.txt +/vidocp.egg-info/top_level.txt diff --git a/tests/test_table_parsing.py b/tests/test_table_parsing.py new file mode 100644 index 0000000..c882d2d --- /dev/null +++ b/tests/test_table_parsing.py @@ -0,0 +1,29 @@ +import pytest +from vidocp.table_parsing import parse_table +import numpy as np +import pdf2image + + +@pytest.fixture() +def rects(): + page_index = 0 + pdf_path = "/home/lillian/vidocp/tests/VV-313450.pdf" + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + rectangles = parse_table(page) + return rectangles + + +def test_num_of_rects(rects): + assert len(rects) == 49 + + +def test_range_of_rects(rects): + expected_range = ((210, 605), (1430, 1620)) + topleft = min(rects) + x,y,w,h = max(rects) + bottomright = (x+w, y+h) + + assert topleft >= expected_range[0] + assert bottomright <= expected_range[1] +