diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e532dc9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +/pdfs_syngenta/2f9bd062b382f3820a43caa993d94bb5.pdf +/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf +/pdfs_syngenta/23c935f9bf704395a214ddd22af45932.pdf +/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf +/pdfs_syngenta/77cc1f61872bf7e5d65836e24449fa35.pdf +/pdfs_syngenta/1797686d2dc44e2e123877eddb5ee00b.pdf +/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf +/pdfs_syngenta/a9648c5ef39fb6a5596d9496349452b4.pdf +/pdfs_syngenta/bbf150588d2d5c213d358fe24179c71a.pdf +/pdfs_syngenta/bfd3cf846f724bd924bc3d148057f99e.pdf +/results/morph_operator_close.png +/results/morph_operator_open.png +/env/ +/home/lillian/table_parsing/.idea/ +/.idea/.gitignore +/.idea/misc.xml +/.idea/inspectionProfiles/profiles_settings.xml +/.idea/table_parsing.iml +/.idea/vcs.xml diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index c6d0306..8473d0d 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -12,12 +12,14 @@ def parse(image: np.array): th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) img_bin = ~img_bin - line_min_width = 4 + line_min_width = 5 kernel_h = np.ones((1, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 1), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + # img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_h) + # img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_v) img_bin_final = img_bin_h | img_bin_v @@ -32,7 +34,7 @@ def parse_tables_in_pdf(pages): def annotate_image(image, stats): for x, y, w, h, area in stats[2:]: - if w > 10 and h > 10: + if w > 35 and h > 13: cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): @@ -53,6 +55,10 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): page = annotate_image(page, stats) fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) + fig.set_size_inches(20 , 20) ax.imshow(page) plt.show() + + +annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4) +annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf", 11)