Change table-parsing parameters: set line_min_width = 5 (l15) so that no cells are missing from tables, and raise the minimum rectangle size (l37) so that no text is detected as a table
This commit is contained in:
parent
25ddbeac19
commit
1cf8508dc3
19
.gitignore
vendored
Normal file
19
.gitignore
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
/pdfs_syngenta/2f9bd062b382f3820a43caa993d94bb5.pdf
|
||||
/pdfs_syngenta/8dc4a4bf9c439eb402adfa2c53ce5c0c.pdf
|
||||
/pdfs_syngenta/23c935f9bf704395a214ddd22af45932.pdf
|
||||
/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf
|
||||
/pdfs_syngenta/77cc1f61872bf7e5d65836e24449fa35.pdf
|
||||
/pdfs_syngenta/1797686d2dc44e2e123877eddb5ee00b.pdf
|
||||
/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf
|
||||
/pdfs_syngenta/a9648c5ef39fb6a5596d9496349452b4.pdf
|
||||
/pdfs_syngenta/bbf150588d2d5c213d358fe24179c71a.pdf
|
||||
/pdfs_syngenta/bfd3cf846f724bd924bc3d148057f99e.pdf
|
||||
/results/morph_operator_close.png
|
||||
/results/morph_operator_open.png
|
||||
/env/
|
||||
/home/lillian/table_parsing/.idea/
|
||||
/.idea/.gitignore
|
||||
/.idea/misc.xml
|
||||
/.idea/inspectionProfiles/profiles_settings.xml
|
||||
/.idea/table_parsing.iml
|
||||
/.idea/vcs.xml
|
||||
@@ -12,12 +12,14 @@ def parse(image: np.array):
|
||||
th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY)
|
||||
img_bin = ~img_bin
|
||||
|
||||
line_min_width = 4
|
||||
line_min_width = 5
|
||||
kernel_h = np.ones((1, line_min_width), np.uint8)
|
||||
kernel_v = np.ones((line_min_width, 1), np.uint8)
|
||||
|
||||
img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
|
||||
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
|
||||
# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_h)
|
||||
# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_CLOSE, kernel_v)
|
||||
|
||||
img_bin_final = img_bin_h | img_bin_v
|
||||
|
||||
@@ -32,7 +34,7 @@ def parse_tables_in_pdf(pages):
|
||||
|
||||
def annotate_image(image, stats):
|
||||
for x, y, w, h, area in stats[2:]:
|
||||
if w > 10 and h > 10:
|
||||
if w > 35 and h > 13:
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
|
||||
|
||||
for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
|
||||
@@ -53,6 +55,10 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
|
||||
page = annotate_image(page, stats)
|
||||
|
||||
fig, ax = plt.subplots(1, 1)
|
||||
fig.set_size_inches(20, 20)
|
||||
fig.set_size_inches(20 , 20)
|
||||
ax.imshow(page)
|
||||
plt.show()
|
||||
|
||||
|
||||
annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/026c917f04660aaea4bb57d180f9598b.pdf", 4)
|
||||
annotate_tables_in_pdf("/home/lillian/table_parsing/pdfs_syngenta/a8cf5a3c09552f3d868bec40f9aa49e5.pdf", 11)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user