Change table-parsing parameters: set line_min_width = 5 (line 15) so that no cells are missed in tables, and raise the minimum rectangle size (line 37) so that plain text is no longer detected as a table

This commit is contained in:
llocarnini 2022-01-24 16:55:29 +01:00
parent 25ddbeac19
commit 1cf8508dc3
2 changed files with 28 additions and 3 deletions

19
.gitignore vendored Normal file
View File

@ -0,0 +1,19 @@
/pdfs_syngenta/2f9bd062b382f3820a43caa993d94bb5.pdf
/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf
/pdfs_syngenta/23c935f9bf704395a214ddd22af45932.pdf
/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf
/pdfs_syngenta/77cc1f61872bf7e5d65836e24449fa35.pdf
/pdfs_syngenta/1797686d2dc44e2e123877eddb5ee00b.pdf
/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf
/pdfs_syngenta/a9648c5ef39fb6a5596d9496349452b4.pdf
/pdfs_syngenta/bbf150588d2d5c213d358fe24179c71a.pdf
/pdfs_syngenta/bfd3cf846f724bd924bc3d148057f99e.pdf
/results/morph_operator_close.png
/results/morph_operator_open.png
/env/
/home/lillian/table_parsing/.idea/
/.idea/.gitignore
/.idea/misc.xml
/.idea/inspectionProfiles/profiles_settings.xml
/.idea/table_parsing.iml
/.idea/vcs.xml

View File

@ -12,12 +12,14 @@ def parse(image: np.array):
th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY)
img_bin = ~img_bin
line_min_width = 4
line_min_width = 5
kernel_h = np.ones((1, line_min_width), np.uint8)
kernel_v = np.ones((line_min_width, 1), np.uint8)
img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_h)
# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_v)
img_bin_final = img_bin_h | img_bin_v
@ -32,7 +34,7 @@ def parse_tables_in_pdf(pages):
def annotate_image(image, stats):
for x, y, w, h, area in stats[2:]:
if w > 10 and h > 10:
if w > 35 and h > 13:
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
@ -53,6 +55,10 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
page = annotate_image(page, stats)
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(20, 20)
fig.set_size_inches(20 , 20)
ax.imshow(page)
plt.show()
annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4)
annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf", 11)