Pull request #5: Table parsing version 2
Merge in RR/vidocp from table_parsing_version_2 to master
Squashed commit of the following:
commit af136ca10cf96f99699e409000ff598ce90c192e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:34:01 2022 +0100
readme updated
commit 13ca7b1b03cb2bf7b3c8ef5821c1f8fa9ec532a0
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:32:11 2022 +0100
drawing color standardized
commit 654e961c62ddc0f512074e8238d7fa88f0ea227e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:22:57 2022 +0100
refactoring
commit 964c17a36f7bbc1376dfe68f4ea90462d676e215
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:07:16 2022 +0100
readme updated
commit 4470969b35bb76e68cc41947fa02e63100b30ce9
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:05:35 2022 +0100
readme updated
commit a6c6bdb1e71a778a3c21a628cfb30acc5bc6086f
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:05:21 2022 +0100
readme updated
commit e178793dd69b720adefe7533312314e4c405f975
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 18:03:45 2022 +0100
readme updated
commit 443163864bab56930c2ef735c0aaafddd2561ead
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 17:59:03 2022 +0100
implemented clean solution for parsing open tables. still needs final refactoring.
This commit is contained in:
parent
224360c823
commit
00748a8ac0
16
README.md
16
README.md
@ -23,7 +23,11 @@ dvc pull
|
||||
### As an API
|
||||
|
||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task. Example for finding the outlines of previous redactions.
|
||||
the specific task.
|
||||
|
||||
#### Redaction Detection
|
||||
|
||||
The snippet below shows how to find the outlines of previous redactions.
|
||||
|
||||
```python
|
||||
|
||||
@ -31,10 +35,10 @@ from vidocp.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
|
||||
page = np.array(page)
|
||||
|
||||
@ -52,13 +56,17 @@ Core API functionalities can be used through a CLI.
|
||||
|
||||
The table parsing utility detects and segments tables into individual cells.
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type table
|
||||
```
|
||||
|
||||
The below image shows a parsed table, where each table cell has been detected individually.
|
||||
|
||||

|
||||
|
||||
|
||||
#### Redaction Detection
|
||||
|
||||
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
|
||||
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
|
||||
```bash
|
||||
python scripts/annotate.py <path to pdf> 0 --type redaction
|
||||
```
|
||||
|
||||
BIN
data/table_parsing.png
Normal file
BIN
data/table_parsing.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 566 KiB |
@ -1,6 +1,6 @@
|
||||
import argparse
|
||||
|
||||
from vidocp.table_parsig import annotate_tables_in_pdf
|
||||
from vidocp.table_parsing import annotate_tables_in_pdf
|
||||
from vidocp.redaction_detection import annotate_boxes_in_pdf
|
||||
from vidocp.layout_detection import annotate_layout_in_pdf
|
||||
|
||||
|
||||
@ -4,7 +4,8 @@ import cv2
|
||||
import numpy as np
|
||||
import pdf2image
|
||||
from iteration_utilities import starfilter, first
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from vidocp.utils import show_mpl, draw_contours
|
||||
|
||||
|
||||
def is_filled(hierarchy):
|
||||
@ -42,22 +43,12 @@ def find_redactions(image: np.array, min_normalized_area=200000):
|
||||
return contours
|
||||
|
||||
|
||||
def annotate_poly(image, contours):
    """Draw every contour onto ``image`` as a green outline.

    The drawing happens in place: ``image`` itself is modified and returned.

    Args:
        image: Array to draw on; passed straight to ``cv2.drawContours``.
        contours: Iterable of contours, one point array per contour.

    Returns:
        The same ``image`` object, now annotated.
    """
    for cont in contours:
        # drawContours expects a *sequence* of contours. Handing it a bare
        # contour makes it treat every point as its own one-point contour, so
        # only isolated dots are drawn. Wrapping the contour in a list draws
        # the connected outline instead.
        cv2.drawContours(image, [cont], -1, (0, 255, 0), 4)

    return image
|
||||
|
||||
|
||||
def annotate_boxes_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, outline the detected redactions and display it.

    Args:
        pdf_path: Path of the PDF file to read.
        page_index: Index of the page to annotate. The page requested from
            pdf2image is ``page_index + 1``.
    """
    page_number = page_index + 1
    page = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)[0]
    page = np.array(page)

    redaction_contours = find_redactions(page)

    # Annotate and display exactly once through the shared helpers. The
    # previous body additionally ran the superseded annotate_poly/pyplot code,
    # which drew the contours twice and opened the figure twice.
    page = draw_contours(page, redaction_contours)

    show_mpl(page)
|
||||
|
||||
@ -1,58 +0,0 @@
|
||||
from itertools import count
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pdf2image
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
|
||||
def parse(image: np.array):
    """Segment a page image into regions enclosed by horizontal/vertical lines.

    Args:
        image: Colour page image in RGB channel order, e.g. ``np.array`` of a
            page rendered by pdf2image.

    Returns:
        Tuple ``(labels, stats)`` as produced by
        ``cv2.connectedComponentsWithStats`` on the inverted line mask.
    """
    # Pages rendered by pdf2image arrive as RGB; COLOR_BGR2GRAY would swap the
    # red and blue weights of the grayscale conversion.
    gray_scale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # maxval must be 255 (it was 225) so that the inverted mask below is a
    # clean 0/255 binary image instead of 30/255.
    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin

    # Opening with a 1xN / Nx1 kernel keeps only strokes that are at least
    # line_min_width pixels long in the respective direction.
    line_min_width = 4
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)

    img_bin_final = img_bin_h | img_bin_v

    # Label the areas *between* the lines, hence the inversion.
    _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)

    return labels, stats
|
||||
|
||||
|
||||
def parse_tables_in_pdf(pages):
    """Lazily pair the ``parse`` result of each page with a running index.

    Args:
        pages: Iterable of page images accepted by ``parse``.

    Returns:
        A ``zip`` iterator of ``(parse(page), index)`` tuples, where ``index``
        counts up from 0.
    """
    parsed_pages = map(parse, pages)
    running_index = count()
    return zip(parsed_pages, running_index)
|
||||
|
||||
|
||||
def annotate_image(image, stats):
    """Draw and label the bounding boxes described by ``stats`` on ``image``.

    Rows of ``stats`` are unpacked as ``(x, y, w, h, area)``. The first two
    rows are skipped, and boxes that are not larger than 10 pixels in both
    dimensions are ignored. Drawing happens in place.

    Args:
        image: Array to draw on.
        stats: Sequence of five-element rows ``(x, y, w, h, area)``.

    Returns:
        The same ``image`` object, now annotated.
    """
    field_names = ("x", "y", "w", "h")
    magenta = (255, 0, 255)

    for x, y, w, h, _area in stats[2:]:
        if w <= 10 or h <= 10:
            continue

        cv2.rectangle(image, (x, y), (x + w, y + h), magenta, 2)

        # Stack the four "name = value" labels upwards from the bottom edge
        # of the box, 20 pixels apart.
        text_x = int(x + 5)
        for row, (name, value) in enumerate(zip(field_names, (x, y, w, h)), start=1):
            text_y = int(y + h - (20 * row))
            cv2.putText(image, f"{name} = {value}", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, magenta, 2)

    return image
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, annotate its parsed table cells and display it.

    Args:
        pdf_path: Path of the PDF file to read.
        page_index: Index of the page to annotate. The page requested from
            pdf2image is ``page_index + 1``.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page_array = np.array(rendered[0])

    # Only the component statistics are needed for drawing.
    stats = parse(page_array)[1]
    annotated = annotate_image(page_array, stats)

    figure, axes = plt.subplots(1, 1)
    figure.set_size_inches(20, 20)
    axes.imshow(annotated)
    plt.show()
|
||||
55
vidocp/table_parsing.py
Normal file
55
vidocp/table_parsing.py
Normal file
@ -0,0 +1,55 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import pdf2image
|
||||
|
||||
from vidocp.utils import show_cv2, draw_stats
|
||||
|
||||
|
||||
def add_external_contours(image, img):
    """Draw the bounding box of every external contour of ``img`` onto ``image``.

    Each box is drawn in place with value 255 and a thickness of 1 pixel.

    Args:
        image: Array to draw the rectangles on.
        img: Image in which the external contours are searched.

    Returns:
        The same ``image`` object with the rectangles added.
    """
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; the contours are the second-to-last
    # item in both cases. The hierarchy is not needed here.
    contours = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)

    return image
|
||||
|
||||
|
||||
def isolate_vertical_and_horizontal_components(img_bin, line_min_width=30):
    """Keep only the horizontal and vertical strokes of a binary image.

    The image is opened once with a ``1 x line_min_width`` kernel and once with
    a ``line_min_width x 1`` kernel; the two results are OR-combined.

    Args:
        img_bin: Binary image to filter.
        line_min_width: Length in pixels of the structuring elements used for
            the two openings. Defaults to 30, the value that was previously
            hard-coded.

    Returns:
        The element-wise OR of the horizontally and vertically opened images.
    """
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)

    return img_bin_h | img_bin_v
|
||||
|
||||
|
||||
def parse_table(image: np.array):
    """Detect the cells of tables on a page image.

    Args:
        image: Colour page image in RGB channel order, e.g. ``np.array`` of a
            page rendered by pdf2image.

    Returns:
        The ``stats`` array of ``cv2.connectedComponentsWithStats`` computed on
        the inverted line mask.
    """
    # Pages rendered by pdf2image arrive as RGB; COLOR_BGR2GRAY would swap the
    # red and blue weights of the grayscale conversion.
    gray_scale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin

    img_bin = isolate_vertical_and_horizontal_components(img_bin)
    # The line mask is both the contour source and the drawing target, so the
    # bounding boxes of its external contours are added to the mask itself.
    img_bin_final = add_external_contours(img_bin, img_bin)

    # Label the areas *between* the lines, hence the inversion. Only the
    # per-component statistics are returned.
    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)

    return stats
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, draw its parsed table cells and display the result.

    Args:
        pdf_path: Path of the PDF file to read.
        page_index: Index of the page to annotate. The page requested from
            pdf2image is ``page_index + 1``.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page_array = np.array(rendered[0])

    cell_stats = parse_table(page_array)
    annotated = draw_stats(page_array, cell_stats)

    show_cv2(annotated)
|
||||
54
vidocp/utils.py
Normal file
54
vidocp/utils.py
Normal file
@ -0,0 +1,54 @@
|
||||
import cv2
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
|
||||
def show_mpl(image):
    """Display ``image`` in a 20 x 20 inch matplotlib figure.

    Args:
        image: Image data accepted by ``Axes.imshow``.
    """
    figure, axes = plt.subplots(1, 1)
    figure.set_size_inches(20, 20)
    axes.imshow(image)
    plt.show()
|
||||
|
||||
|
||||
def show_cv2(image):
    """Display ``image`` in an untitled OpenCV window and wait for a key press.

    Args:
        image: Image data accepted by ``cv2.imshow``.
    """
    window_title = ""
    cv2.imshow(window_title, image)
    # A delay of 0 makes waitKey block until a key is pressed.
    cv2.waitKey(0)
|
||||
|
||||
|
||||
def draw_contours(image, contours):
    """Return a copy of ``image`` with every contour drawn as a green outline.

    The input image is left untouched.

    Args:
        image: Array providing ``copy()``; the copy is the drawing target.
        contours: Iterable of contours, one point array per contour.

    Returns:
        The annotated copy of ``image``.
    """
    image = image.copy()

    for cont in contours:
        # drawContours expects a *sequence* of contours. Handing it a bare
        # contour makes it treat every point as its own one-point contour, so
        # only isolated dots are drawn. Wrapping the contour in a list draws
        # the connected outline instead.
        cv2.drawContours(image, [cont], -1, (0, 255, 0), 4)

    return image
|
||||
|
||||
|
||||
def draw_stats(image, stats, annotate=False):
    """Return a copy of ``image`` with the boxes described by ``stats`` drawn in green.

    Rows of ``stats`` are unpacked as ``(x, y, w, h, area)``. The first two
    rows are skipped (presumably the background and a page-sized component;
    verify against the producer of ``stats``).

    Args:
        image: Array providing ``copy()``; the copy is the drawing target.
        stats: Sequence of five-element rows ``(x, y, w, h, area)``.
        annotate: When true, additionally write ``x``, ``y``, ``w`` and ``h``
            as text into each box.

    Returns:
        The annotated copy of ``image``.
    """
    canvas = image.copy()
    green = (0, 255, 0)
    field_names = ("x", "y", "w", "h")

    for x, y, w, h, _area in stats[2:]:
        cv2.rectangle(canvas, (x, y), (x + w, y + h), green, 2)

        if not annotate:
            continue

        # Stack the four "name = value" labels upwards from the bottom edge
        # of the box, 20 pixels apart.
        text_x = int(x + 5)
        for row, (name, value) in enumerate(zip(field_names, (x, y, w, h)), start=1):
            text_y = int(y + h - (20 * row))
            cv2.putText(canvas, f"{name} = {value}", (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, green, 2)

    return canvas
|
||||
Loading…
x
Reference in New Issue
Block a user