Change table-parsing parameters: set line_min_width = 5 (l15) so that no cells are missing from detected tables, and raise the minimum rectangle size (l37) so that plain text is not detected as a table

2022-01-24 16:55:29 +01:00 · 2022-01-24 16:55:29 +01:00 · 1cf8508dc3
commit 1cf8508dc3
parent 25ddbeac19
2 changed files with 28 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,19 @@
+/pdfs_syngenta/2f9bd062b382f3820a43caa993d94bb5.pdf
+/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf
+/pdfs_syngenta/23c935f9bf704395a214ddd22af45932.pdf
+/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf
+/pdfs_syngenta/77cc1f61872bf7e5d65836e24449fa35.pdf
+/pdfs_syngenta/1797686d2dc44e2e123877eddb5ee00b.pdf
+/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf
+/pdfs_syngenta/a9648c5ef39fb6a5596d9496349452b4.pdf
+/pdfs_syngenta/bbf150588d2d5c213d358fe24179c71a.pdf
+/pdfs_syngenta/bfd3cf846f724bd924bc3d148057f99e.pdf
+/results/morph_operator_close.png
+/results/morph_operator_open.png
+/env/
+/home/lillian/table_parsing/.idea/
+/.idea/.gitignore
+/.idea/misc.xml
+/.idea/inspectionProfiles/profiles_settings.xml
+/.idea/table_parsing.iml
+/.idea/vcs.xml
--- a/table_parsing/table_parsig.py
+++ b/table_parsing/table_parsig.py
@ -12,12 +12,14 @@ def parse(image: np.array):
    th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY)
    img_bin = ~img_bin

-    line_min_width = 4
+    line_min_width = 5
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+    # img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_h)
+    # img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_v)

    img_bin_final = img_bin_h | img_bin_v

@ -32,7 +34,7 @@ def parse_tables_in_pdf(pages):

 def annotate_image(image, stats):
    for x, y, w, h, area in stats[2:]:
-        if w > 10 and h > 10:
+        if w > 35 and h > 13:
            cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)

            for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
@ -53,6 +55,10 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
    page = annotate_image(page, stats)

    fig, ax = plt.subplots(1, 1)
-    fig.set_size_inches(20, 20)
+    fig.set_size_inches(20 , 20)
    ax.imshow(page)
    plt.show()
+
+
+annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4)
+annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf", 11)