Pull request #5: Table parsing version 2

Merge in RR/vidocp from table_parsing_version_2 to master Squashed commit of the following: commit af136ca10cf96f99699e409000ff598ce90c192e Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:34:01 2022 +0100 readme updated commit 13ca7b1b03cb2bf7b3c8ef5821c1f8fa9ec532a0 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:32:11 2022 +0100 drawing color standardized commit 654e961c62ddc0f512074e8238d7fa88f0ea227e Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:22:57 2022 +0100 refactoring commit 964c17a36f7bbc1376dfe68f4ea90462d676e215 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:07:16 2022 +0100 readme updated commit 4470969b35bb76e68cc41947fa02e63100b30ce9 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:05:35 2022 +0100 readme updated commit a6c6bdb1e71a778a3c21a628cfb30acc5bc6086f Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:05:21 2022 +0100 readme updated commit e178793dd69b720adefe7533312314e4c405f975 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 18:03:45 2022 +0100 readme updated commit 443163864bab56930c2ef735c0aaafddd2561ead Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 17:59:03 2022 +0100 implemented clean solution for parsing open tables. still needs final refactoring.
2022-02-05 19:32:47 +01:00 · 2022-02-05 19:32:47 +01:00 · 00748a8ac0
commit 00748a8ac0
parent 224360c823
7 changed files with 126 additions and 76 deletions
--- a/README.md
+++ b/README.md
@@ -23,7 +23,11 @@ dvc pull
 ### As an API

 The module provides functions for the individual tasks that all return some kind of collection of points, depending on
-the specific task. Example for finding the outlines of previous redactions.
+the specific task.
+
+#### Redaction Detection
+
+The below snippet shows how to find the outlines of previous redactions.

 ```python

@@ -31,10 +35,10 @@ from vidocp.redaction_detection import find_redactions
 import pdf2image 
 import numpy as np

+
 pdf_path = ...
 page_index = ...

-
 page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
 page = np.array(page)

@@ -52,13 +56,17 @@ Core API functionalities can be used through a CLI.

 The table parsing utility detects and segments tables into individual cells.
 ```bash
-python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
+python scripts/annotate.py data/test_pdf.pdf 7 --type table
 ```

+The below image shows a parsed table, where each table cell has been detected individually.
+
+![](data/table_parsing.png)
+

 #### Redaction Detection

-The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
+The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
 ```bash
 python scripts/annotate.py <path to pdf> 0 --type redaction
 ```
--- a/data/table_parsing.png
+++ b/data/table_parsing.png
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@@ -1,6 +1,6 @@
 import argparse

-from vidocp.table_parsig import annotate_tables_in_pdf
+from vidocp.table_parsing import annotate_tables_in_pdf
 from vidocp.redaction_detection import annotate_boxes_in_pdf
 from vidocp.layout_detection import annotate_layout_in_pdf

--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@@ -4,7 +4,8 @@ import cv2
 import numpy as np
 import pdf2image
 from iteration_utilities import starfilter, first
-from matplotlib import pyplot as plt
+
+from vidocp.utils import show_mpl, draw_contours


 def is_filled(hierarchy):
@@ -42,22 +43,12 @@ def find_redactions(image: np.array, min_normalized_area=200000):
    return contours


-def annotate_poly(image, contours):
-    for cont in contours:
-        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
-
-    return image
-
-
 def annotate_boxes_in_pdf(pdf_path, page_index=1):

    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)

    redaction_contours = find_redactions(page)
-    page = annotate_poly(page, redaction_contours)
+    page = draw_contours(page, redaction_contours)

-    fig, ax = plt.subplots(1, 1)
-    fig.set_size_inches(20, 20)
-    ax.imshow(page)
-    plt.show()
+    show_mpl(page)
--- a/vidocp/table_parsig.py
+++ b/vidocp/table_parsig.py
@@ -1,58 +0,0 @@
-from itertools import count
-
-import cv2
-import numpy as np
-import pdf2image
-from matplotlib import pyplot as plt
-
-
-def parse(image: np.array):
-
-    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY)
-    img_bin = ~img_bin
-
-    line_min_width = 4
-    kernel_h = np.ones((1, line_min_width), np.uint8)
-    kernel_v = np.ones((line_min_width, 1), np.uint8)
-
-    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
-    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-
-    img_bin_final = img_bin_h | img_bin_v
-
-    _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
-
-    return labels, stats
-
-
-def parse_tables_in_pdf(pages):
-    return zip(map(parse, pages), count())
-
-
-def annotate_image(image, stats):
-    for x, y, w, h, area in stats[2:]:
-        if w > 10 and h > 10:
-            cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
-
-            for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
-                anno = f"{s} = {v}"
-                xann = int(x + 5)
-                yann = int(y + h - (20 * (i + 1)))
-                cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
-
-    return image
-
-
-def annotate_tables_in_pdf(pdf_path, page_index=1):
-
-    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
-    page = np.array(page)
-
-    _, stats = parse(page)
-    page = annotate_image(page, stats)
-
-    fig, ax = plt.subplots(1, 1)
-    fig.set_size_inches(20, 20)
-    ax.imshow(page)
-    plt.show()
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -0,0 +1,55 @@
+import cv2
+import numpy as np
+from pdf2image import pdf2image
+
+from vidocp.utils import show_cv2, draw_stats
+
+
+def add_external_contours(image, img):
+
+    contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+    for cnt in contours:
+        x, y, w, h = cv2.boundingRect(cnt)
+        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
+
+    return image
+
+
+def isolate_vertical_and_horizontal_components(img_bin):
+
+    line_min_width = 30
+    kernel_h = np.ones((1, line_min_width), np.uint8)
+    kernel_v = np.ones((line_min_width, 1), np.uint8)
+
+    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+
+    img_bin_final = img_bin_h | img_bin_v
+
+    return img_bin_final
+
+
+def parse_table(image: np.array):
+
+    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
+    img_bin = ~img_bin
+
+    img_bin = isolate_vertical_and_horizontal_components(img_bin)
+    img_bin_final = add_external_contours(img_bin, img_bin)
+
+    _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
+
+    return stats
+
+
+def annotate_tables_in_pdf(pdf_path, page_index=1):
+
+    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = np.array(page)
+
+    stats = parse_table(page)
+    page = draw_stats(page, stats)
+
+    show_cv2(page)
--- a/vidocp/utils.py
+++ b/vidocp/utils.py
@@ -0,0 +1,54 @@
+import cv2
+from matplotlib import pyplot as plt
+
+
+def show_mpl(image):
+
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(image)
+    plt.show()
+
+
+def show_cv2(image):
+
+    cv2.imshow("", image)
+    cv2.waitKey(0)
+
+
+def draw_contours(image, contours):
+
+    image = image.copy()
+
+    for cont in contours:
+        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
+
+    return image
+
+
+def draw_stats(image, stats, annotate=False):
+
+    image = image.copy()
+    keys = ["x", "y", "w", "h"]
+
+    def annotate_stat(x, y, w, h):
+
+        for i, (s, v) in enumerate(zip(keys, [x, y, w, h])):
+            anno = f"{s} = {v}"
+            xann = int(x + 5)
+            yann = int(y + h - (20 * (i + 1)))
+            cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
+
+    def draw_stat(stat):
+
+        x, y, w, h, area = stat
+
+        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+        if annotate:
+            annotate_stat(x, y, w, h)
+
+    for stat in stats[2:]:
+        draw_stat(stat)
+
+    return image