Pull request #5: Table parsing version 2

Merge in RR/vidocp from table_parsing_version_2 to master

Squashed commit of the following:

commit af136ca10cf96f99699e409000ff598ce90c192e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:34:01 2022 +0100

    readme updated

commit 13ca7b1b03cb2bf7b3c8ef5821c1f8fa9ec532a0
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:32:11 2022 +0100

    drawing color standardized

commit 654e961c62ddc0f512074e8238d7fa88f0ea227e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:22:57 2022 +0100

    refactoring

commit 964c17a36f7bbc1376dfe68f4ea90462d676e215
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:07:16 2022 +0100

    readme updated

commit 4470969b35bb76e68cc41947fa02e63100b30ce9
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:05:35 2022 +0100

    readme updated

commit a6c6bdb1e71a778a3c21a628cfb30acc5bc6086f
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:05:21 2022 +0100

    readme updated

commit e178793dd69b720adefe7533312314e4c405f975
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 18:03:45 2022 +0100

    readme updated

commit 443163864bab56930c2ef735c0aaafddd2561ead
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 17:59:03 2022 +0100

    implemented clean solution for parsing open tables. still needs final refactoring.
This commit is contained in:
Matthias Bisping 2022-02-05 19:32:47 +01:00
parent 224360c823
commit 00748a8ac0
7 changed files with 126 additions and 76 deletions

View File

@ -23,7 +23,11 @@ dvc pull
### As an API
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
the specific task. Example for finding the outlines of previous redactions.
the specific task.
#### Redaction Detection
The below snippet shows how to find the outlines of previous redactions.
```python
@ -31,10 +35,10 @@ from vidocp.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
@ -52,13 +56,17 @@ Core API functionalities can be used through a CLI.
The table parsing utility detects and segments tables into individual cells.
```bash
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```
The below image shows a parsed table, where each table cell has been detected individually.
![](data/table_parsing.png)
#### Redaction Detection
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash
python scripts/annotate.py <path to pdf> 0 --type redaction
```

BIN
data/table_parsing.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 566 KiB

View File

@ -1,6 +1,6 @@
import argparse
from vidocp.table_parsig import annotate_tables_in_pdf
from vidocp.table_parsing import annotate_tables_in_pdf
from vidocp.redaction_detection import annotate_boxes_in_pdf
from vidocp.layout_detection import annotate_layout_in_pdf

View File

@ -4,7 +4,8 @@ import cv2
import numpy as np
import pdf2image
from iteration_utilities import starfilter, first
from matplotlib import pyplot as plt
from vidocp.utils import show_mpl, draw_contours
def is_filled(hierarchy):
@ -42,22 +43,12 @@ def find_redactions(image: np.array, min_normalized_area=200000):
return contours
def annotate_poly(image, contours):
    """Draw each entry of ``contours`` onto ``image`` in green and return it.

    The drawing happens directly on the array that was passed in; no copy is
    made, so the caller's image is modified.
    """
    green = (0, 255, 0)
    line_width = 4
    for outline in contours:
        # -1 asks OpenCV to draw every element of what it is handed.
        cv2.drawContours(image, outline, -1, green, line_width)
    return image
def annotate_boxes_in_pdf(pdf_path, page_index=1):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
page = annotate_poly(page, redaction_contours)
page = draw_contours(page, redaction_contours)
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(20, 20)
ax.imshow(page)
plt.show()
show_mpl(page)

View File

@ -1,58 +0,0 @@
from itertools import count
import cv2
import numpy as np
import pdf2image
from matplotlib import pyplot as plt
def parse(image: np.ndarray):
    """Label the regions of an image that are separated by straight lines.

    The image is binarized and inverted, only strokes that survive a
    morphological opening with a 1x4 or a 4x1 kernel are kept, and the
    connected components of everything that is *not* such a stroke are
    labelled.

    Args:
        image: Colour image; it is converted with ``cv2.COLOR_BGR2GRAY``.
            NOTE(review): pages coming from pdf2image are presumably RGB -
            confirm whether ``COLOR_RGB2GRAY`` is what is intended.

    Returns:
        The ``(labels, stats)`` pair produced by
        ``cv2.connectedComponentsWithStats``.
    """
    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # maxval is 255 (previously 225) so the inverted mask is strictly 0/255.
    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
    # Invert so that dark strokes become the foreground for the opening.
    img_bin = ~img_bin

    line_min_width = 4
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
    img_bin_final = img_bin_h | img_bin_v

    # Components are computed on the inverse: the areas between the lines.
    _, labels, stats, _ = cv2.connectedComponentsWithStats(
        ~img_bin_final, connectivity=8, ltype=cv2.CV_32S
    )
    return labels, stats
def parse_tables_in_pdf(pages):
    """Lazily pair the ``parse`` result of every page with a running index.

    Returns a ``zip`` iterator yielding ``(parse(page), index)`` tuples, with
    the index starting at 0.
    """
    parsed_pages = map(parse, pages)
    page_numbers = count()
    return zip(parsed_pages, page_numbers)
def annotate_image(image, stats):
    """Outline and label every sufficiently large component on ``image``.

    The first two rows of ``stats`` are skipped. For each remaining row whose
    width and height both exceed 10, a magenta rectangle is drawn together
    with its ``x``, ``y``, ``w`` and ``h`` values as text. The image is
    modified in place and returned.
    """
    magenta = (255, 0, 255)
    for left, top, width, height, _area in stats[2:]:
        if not (width > 10 and height > 10):
            continue
        cv2.rectangle(image, (left, top), (left + width, top + height), magenta, 2)
        named_values = zip(["x", "y", "w", "h"], [left, top, width, height])
        for row, (key, value) in enumerate(named_values, start=1):
            # Stack the text lines upwards from the bottom edge, 20 px apart.
            origin = (int(left + 5), int(top + height - (20 * row)))
            cv2.putText(image, f"{key} = {value}", origin, cv2.FONT_HERSHEY_SIMPLEX, 0.7, magenta, 2)
    return image
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, annotate its parsed regions and show the result.

    ``page_index`` is incremented by one before it is handed to pdf2image as
    both the first and the last page, so exactly one page is rendered.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    _, stats = parse(page)
    annotated = annotate_image(page, stats)

    figure, axes = plt.subplots(1, 1)
    figure.set_size_inches(20, 20)
    axes.imshow(annotated)
    plt.show()

55
vidocp/table_parsing.py Normal file
View File

@ -0,0 +1,55 @@
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils import show_cv2, draw_stats
def add_external_contours(image, img):
    """Draw the bounding box of every outermost contour of ``img`` on ``image``.

    Contours are searched in ``img`` with ``cv2.RETR_EXTERNAL``; each one's
    axis-aligned bounding rectangle is drawn onto ``image`` with value 255 and
    a thickness of 1. ``image`` is modified in place and returned.
    """
    outer_contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for left, top, width, height in map(cv2.boundingRect, outer_contours):
        cv2.rectangle(image, (left, top), (left + width, top + height), 255, 1)
    return image
def isolate_vertical_and_horizontal_components(img_bin, line_min_width=30):
    """Keep only the long horizontal and vertical strokes of a binary image.

    The image is opened once with a ``1 x line_min_width`` kernel and once
    with a ``line_min_width x 1`` kernel; the two results are combined with a
    bitwise OR.

    Args:
        img_bin: Binary image whose foreground is non-zero.
        line_min_width: Length in pixels of the opening kernels. Defaults to
            30, the value that used to be hard-coded here.

    Returns:
        The combined image of the horizontal and vertical opening results.
    """
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)

    return img_bin_h | img_bin_v
def parse_table(image: np.ndarray):
    """Compute bounding-box statistics for the cells found in an image.

    The image is binarized and inverted, reduced to its horizontal and
    vertical strokes, supplemented with the bounding boxes of the outermost
    contours, and finally the connected components of the inverse of that
    line mask are computed.

    Args:
        image: Colour image; it is converted with ``cv2.COLOR_BGR2GRAY``.
            NOTE(review): pages coming from pdf2image are presumably RGB -
            confirm whether ``COLOR_RGB2GRAY`` is what is intended.

    Returns:
        The ``stats`` array of ``cv2.connectedComponentsWithStats``.
    """
    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
    # Invert so that dark strokes become the foreground.
    img_bin = ~img_bin

    img_bin = isolate_vertical_and_horizontal_components(img_bin)
    # The same array is passed as source and as drawing target, so the
    # bounding boxes are drawn into the line mask itself.
    img_bin_final = add_external_contours(img_bin, img_bin)

    # Components are computed on the inverse: the areas between the lines.
    _, _, stats, _ = cv2.connectedComponentsWithStats(
        ~img_bin_final, connectivity=8, ltype=cv2.CV_32S
    )
    return stats
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, draw its parsed table cells and display it.

    ``page_index`` is incremented by one before it is handed to pdf2image as
    both the first and the last page, so exactly one page is rendered.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    cell_stats = parse_table(page)
    annotated = draw_stats(page, cell_stats)
    show_cv2(annotated)

54
vidocp/utils.py Normal file
View File

@ -0,0 +1,54 @@
import cv2
from matplotlib import pyplot as plt
def show_mpl(image):
    """Display ``image`` with matplotlib in a single 20 x 20 inch figure."""
    figure, axes = plt.subplots(1, 1)
    figure.set_size_inches(20, 20)
    axes.imshow(image)
    plt.show()
def show_cv2(image):
    """Display ``image`` in an OpenCV window with an empty title and wait for a key."""
    window_title = ""
    cv2.imshow(window_title, image)
    # A delay of 0 waits for a key press without a timeout.
    cv2.waitKey(0)
def draw_contours(image, contours):
    """Return a copy of ``image`` with each entry of ``contours`` drawn in green.

    The input image is left untouched; all drawing happens on the copy.
    """
    canvas = image.copy()
    green = (0, 255, 0)
    line_width = 4
    for outline in contours:
        # -1 asks OpenCV to draw every element of what it is handed.
        cv2.drawContours(canvas, outline, -1, green, line_width)
    return canvas
def draw_stats(image, stats, annotate=False):
    """Return a copy of ``image`` with a green rectangle for each stats row.

    The first two rows of ``stats`` are skipped. Every remaining row is
    unpacked as ``x, y, w, h, area`` and outlined; with ``annotate`` set, the
    ``x``, ``y``, ``w`` and ``h`` values are also written inside the box.
    The input image is not modified.
    """
    canvas = image.copy()
    green = (0, 255, 0)

    for left, top, width, height, _area in stats[2:]:
        cv2.rectangle(canvas, (left, top), (left + width, top + height), green, 2)
        if not annotate:
            continue
        named_values = zip(["x", "y", "w", "h"], [left, top, width, height])
        for row, (key, value) in enumerate(named_values, start=1):
            # Stack the text lines upwards from the bottom edge, 20 px apart.
            origin = (int(left + 5), int(top + height - (20 * row)))
            cv2.putText(canvas, f"{key} = {value}", origin, cv2.FONT_HERSHEY_SIMPLEX, 0.7, green, 2)

    return canvas