Merge branch 'table_parsing_version_2' of ssh://git.iqser.com:2222/rr/table_parsing into uncommon-tables

Conflicts: requirements.txt
2022-02-05 18:03:14 +01:00 · 2022-02-05 18:03:14 +01:00 · 17b8e3a16e
commit 17b8e3a16e
parent b569b03572 443163864b
22 changed files with 818 additions and 21 deletions
--- a/.dvc/.gitignore
+++ b/.dvc/.gitignore
@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
--- a/.dvc/config
+++ b/.dvc/config
@ -0,0 +1,7 @@
+[core]
+    remote = vector
+    autostage = true
+['remote "vector"']
+    url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
+    port = 22
+
--- a/.dvc/plots/confusion.json
+++ b/.dvc/plots/confusion.json
@ -0,0 +1,107 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "facet": {
+        "field": "rev",
+        "type": "nominal"
+    },
+    "spec": {
+        "transform": [
+            {
+                "aggregate": [
+                    {
+                        "op": "count",
+                        "as": "xy_count"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>",
+                    "<DVC_METRIC_X>"
+                ]
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_Y>"
+                ],
+                "key": "<DVC_METRIC_X>",
+                "value": 0
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_X>"
+                ],
+                "key": "<DVC_METRIC_Y>",
+                "value": 0
+            },
+            {
+                "joinaggregate": [
+                    {
+                        "op": "max",
+                        "field": "xy_count",
+                        "as": "max_count"
+                    }
+                ],
+                "groupby": []
+            },
+            {
+                "calculate": "datum.xy_count / datum.max_count",
+                "as": "percent_of_max"
+            }
+        ],
+        "encoding": {
+            "x": {
+                "field": "<DVC_METRIC_X>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_X_LABEL>"
+            },
+            "y": {
+                "field": "<DVC_METRIC_Y>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_Y_LABEL>"
+            }
+        },
+        "layer": [
+            {
+                "mark": "rect",
+                "width": 300,
+                "height": 300,
+                "encoding": {
+                    "color": {
+                        "field": "xy_count",
+                        "type": "quantitative",
+                        "title": "",
+                        "scale": {
+                            "domainMin": 0,
+                            "nice": true
+                        }
+                    }
+                }
+            },
+            {
+                "mark": "text",
+                "encoding": {
+                    "text": {
+                        "field": "xy_count",
+                        "type": "quantitative"
+                    },
+                    "color": {
+                        "condition": {
+                            "test": "datum.percent_of_max > 0.5",
+                            "value": "white"
+                        },
+                        "value": "black"
+                    }
+                }
+            }
+        ]
+    }
+}
--- a/.dvc/plots/confusion_normalized.json
+++ b/.dvc/plots/confusion_normalized.json
@ -0,0 +1,112 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "facet": {
+        "field": "rev",
+        "type": "nominal"
+    },
+    "spec": {
+        "transform": [
+            {
+                "aggregate": [
+                    {
+                        "op": "count",
+                        "as": "xy_count"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>",
+                    "<DVC_METRIC_X>"
+                ]
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_Y>"
+                ],
+                "key": "<DVC_METRIC_X>",
+                "value": 0
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_X>"
+                ],
+                "key": "<DVC_METRIC_Y>",
+                "value": 0
+            },
+            {
+                "joinaggregate": [
+                    {
+                        "op": "sum",
+                        "field": "xy_count",
+                        "as": "sum_y"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>"
+                ]
+            },
+            {
+                "calculate": "datum.xy_count / datum.sum_y",
+                "as": "percent_of_y"
+            }
+        ],
+        "encoding": {
+            "x": {
+                "field": "<DVC_METRIC_X>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_X_LABEL>"
+            },
+            "y": {
+                "field": "<DVC_METRIC_Y>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_Y_LABEL>"
+            }
+        },
+        "layer": [
+            {
+                "mark": "rect",
+                "width": 300,
+                "height": 300,
+                "encoding": {
+                    "color": {
+                        "field": "percent_of_y",
+                        "type": "quantitative",
+                        "title": "",
+                        "scale": {
+                            "domain": [
+                                0,
+                                1
+                            ]
+                        }
+                    }
+                }
+            },
+            {
+                "mark": "text",
+                "encoding": {
+                    "text": {
+                        "field": "percent_of_y",
+                        "type": "quantitative",
+                        "format": ".2f"
+                    },
+                    "color": {
+                        "condition": {
+                            "test": "datum.percent_of_y > 0.5",
+                            "value": "white"
+                        },
+                        "value": "black"
+                    }
+                }
+            }
+        ]
+    }
+}
--- a/.dvc/plots/linear.json
+++ b/.dvc/plots/linear.json
@ -0,0 +1,116 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "layer": [
+        {
+            "encoding": {
+                "x": {
+                    "field": "<DVC_METRIC_X>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_X_LABEL>"
+                },
+                "y": {
+                    "field": "<DVC_METRIC_Y>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_Y_LABEL>",
+                    "scale": {
+                        "zero": false
+                    }
+                },
+                "color": {
+                    "field": "rev",
+                    "type": "nominal"
+                }
+            },
+            "layer": [
+                {
+                    "mark": "line"
+                },
+                {
+                    "selection": {
+                        "label": {
+                            "type": "single",
+                            "nearest": true,
+                            "on": "mouseover",
+                            "encodings": [
+                                "x"
+                            ],
+                            "empty": "none",
+                            "clear": "mouseout"
+                        }
+                    },
+                    "mark": "point",
+                    "encoding": {
+                        "opacity": {
+                            "condition": {
+                                "selection": "label",
+                                "value": 1
+                            },
+                            "value": 0
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "transform": [
+                {
+                    "filter": {
+                        "selection": "label"
+                    }
+                }
+            ],
+            "layer": [
+                {
+                    "mark": {
+                        "type": "rule",
+                        "color": "gray"
+                    },
+                    "encoding": {
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        }
+                    }
+                },
+                {
+                    "encoding": {
+                        "text": {
+                            "type": "quantitative",
+                            "field": "<DVC_METRIC_Y>"
+                        },
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        },
+                        "y": {
+                            "field": "<DVC_METRIC_Y>",
+                            "type": "quantitative"
+                        }
+                    },
+                    "layer": [
+                        {
+                            "mark": {
+                                "type": "text",
+                                "align": "left",
+                                "dx": 5,
+                                "dy": -5
+                            },
+                            "encoding": {
+                                "color": {
+                                    "type": "nominal",
+                                    "field": "rev"
+                                }
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
--- a/.dvc/plots/scatter.json
+++ b/.dvc/plots/scatter.json
@ -0,0 +1,104 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "layer": [
+        {
+            "encoding": {
+                "x": {
+                    "field": "<DVC_METRIC_X>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_X_LABEL>"
+                },
+                "y": {
+                    "field": "<DVC_METRIC_Y>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_Y_LABEL>",
+                    "scale": {
+                        "zero": false
+                    }
+                },
+                "color": {
+                    "field": "rev",
+                    "type": "nominal"
+                }
+            },
+            "layer": [
+                {
+                    "mark": "point"
+                },
+                {
+                    "selection": {
+                        "label": {
+                            "type": "single",
+                            "nearest": true,
+                            "on": "mouseover",
+                            "encodings": [
+                                "x"
+                            ],
+                            "empty": "none",
+                            "clear": "mouseout"
+                        }
+                    },
+                    "mark": "point",
+                    "encoding": {
+                        "opacity": {
+                            "condition": {
+                                "selection": "label",
+                                "value": 1
+                            },
+                            "value": 0
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "transform": [
+                {
+                    "filter": {
+                        "selection": "label"
+                    }
+                }
+            ],
+            "layer": [
+                {
+                    "encoding": {
+                        "text": {
+                            "type": "quantitative",
+                            "field": "<DVC_METRIC_Y>"
+                        },
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        },
+                        "y": {
+                            "field": "<DVC_METRIC_Y>",
+                            "type": "quantitative"
+                        }
+                    },
+                    "layer": [
+                        {
+                            "mark": {
+                                "type": "text",
+                                "align": "left",
+                                "dx": 5,
+                                "dy": -5
+                            },
+                            "encoding": {
+                                "color": {
+                                    "type": "nominal",
+                                    "field": "rev"
+                                }
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
--- a/.dvc/plots/simple.json
+++ b/.dvc/plots/simple.json
@ -0,0 +1,31 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    }
+}
--- a/.dvc/plots/smooth.json
+++ b/.dvc/plots/smooth.json
@ -0,0 +1,39 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    },
+    "transform": [
+        {
+            "loess": "<DVC_METRIC_Y>",
+            "on": "<DVC_METRIC_X>",
+            "groupby": [
+                "rev"
+            ],
+            "bandwidth": 0.3
+        }
+    ]
+}
--- a/.dvcignore
+++ b/.dvcignore
@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
--- a/README.md
+++ b/README.md
@ -0,0 +1,68 @@
+# Vidocp &mdash; Visual Document Parsing
+
+This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
+previous redactions in documents.
+
+## Installation
+
+```bash
+git clone ssh://git@git.iqser.com:2222/rr/vidocp.git
+cd vidocp
+
+python -m venv env
+source env/bin/activate
+
+pip install -e .
+pip install -r requirements.txt
+
+dvc pull
+```
+
+## Usage
+
+### As an API
+
+The module provides functions for the individual tasks that all return some kind of collection of points, depending on
+the specific task. Example for finding the outlines of previous redactions:
+
+```python
+
+from vidocp.redaction_detection import find_redactions
+import pdf2image 
+import numpy as np
+
+pdf_path = ...
+page_index = ...
+
+
+page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
+page = np.array(page)
+
+redaction_contours = find_redactions(page)
+```
+
+
+### As a CLI Tool
+
+
+Core API functionalities can be used through a CLI.
+
+
+#### Table Parsing
+
+The table parsing utility detects and segments tables into individual cells.
+```bash
+python scripts/annotate.py data/test_pdf.pdf 2 --type table
+```
+
+
+#### Redaction Detection
+
+The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
+```bash
+python scripts/annotate.py <path to pdf> 0 --type redaction
+```
+
+The image below shows the detected redactions with green outlines.
+
+![](data/redaction_detection.png)
--- a/data/.gitignore
+++ b/data/.gitignore
@ -0,0 +1 @@
+/test_pdf.pdf
--- a/data/redaction_detection.png
+++ b/data/redaction_detection.png
--- a/data/test_pdf.pdf.dvc
+++ b/data/test_pdf.pdf.dvc
@ -0,0 +1,4 @@
+outs:
+- md5: 60840305e4ddb084aea21976b8b7c49e
+  size: 6916053
+  path: test_pdf.pdf
--- a/requirements.txt
+++ b/requirements.txt
@ -2,4 +2,8 @@ opencv-python~=4.5.5.62
 numpy~=1.22.1
 pdf2image~=1.16.0
 matplotlib~=3.5.1
-imutils~=0.5.4
+imutils==0.5.4
+iteration-utilities==0.11.0
+dvc==2.9.3
+dvc[ssh]
+
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@ -0,0 +1,26 @@
+import argparse
+
+from vidocp.table_parsing_2 import annotate_tables_in_pdf
+from vidocp.redaction_detection import annotate_boxes_in_pdf
+from vidocp.layout_detection import annotate_layout_in_pdf
+
+
+def parse_args():
+    """Parse the command-line arguments of the annotation script.
+
+    Returns:
+        The parsed ``argparse.Namespace`` with the attributes ``pdf_path``, ``page_index`` (converted to
+        ``int``) and ``type`` (one of "table", "redaction" or "layout"; "table" if omitted).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("pdf_path")
+    parser.add_argument("page_index", type=int)
+    parser.add_argument("--type", choices=["table", "redaction", "layout"], default="table")
+
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    # Dispatch to the annotation routine that matches the requested --type.
+    if args.type == "table":
+        annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
+    elif args.type == "redaction":
+        annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index)
+    elif args.type == "layout":
+        annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index)
--- a/scripts/annotate_table.py
+++ b/scripts/annotate_table.py
@ -1,18 +0,0 @@
-import argparse
-
-from table_parsing.table_parsig import annotate_tables_in_pdf
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("pdf_path")
-    parser.add_argument("page_index", type=int)
-
-    args = parser.parse_args()
-
-    return args
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
--- a/setup.py
+++ b/setup.py
@ -3,11 +3,11 @@
 from distutils.core import setup

 setup(
-    name="table_parsing",
+    name="vidocp",
    version="0.0.1",
    description="",
    author="",
    author_email="",
    url="",
-    packages=["table_parsing"],
+    packages=["vidocp"],
 )
--- a/table_parsing/init.py
+++ b/table_parsing/init.py
--- a/vidocp/layout_detection.py
+++ b/vidocp/layout_detection.py
@ -0,0 +1,53 @@
+from itertools import count
+
+import cv2
+import numpy as np
+import pdf2image
+from matplotlib import pyplot as plt
+import imutils
+
+
+def find_layout_boxes(image: np.array):
+    """Yield bounding rectangles of block-like regions found in a page image.
+
+    This is a generator: the image processing runs when the result is first iterated.
+
+    Args:
+        image: Color image of the page (it is converted to grayscale via ``cv2.cvtColor``).
+
+    Yields:
+        The result of ``cv2.boundingRect`` for the polygon approximation of each external contour,
+        i.e. an ``(x, y, w, h)`` tuple.
+    """
+
+    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1)
+    thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1]
+    # Invert the binary image so that everything that is not (near) white becomes foreground.
+    img_bin = ~thresh
+
+    # NOTE(review): with line_min_width = 10 both kernels are 10x10, so the two openings below are identical.
+    line_min_width = 10
+    kernel_h = np.ones((10, line_min_width), np.uint8)
+    kernel_v = np.ones((line_min_width, 10), np.uint8)
+
+    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+
+    img_bin_final = img_bin_h | img_bin_v
+
+    contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # imutils.grab_contours extracts the contour list from the findContours return value.
+    contours = imutils.grab_contours(contours)
+    for c in contours:
+        peri = cv2.arcLength(c, True)
+        # Simplify the contour with a tolerance of 4% of its perimeter before taking the bounding box.
+        approx = cv2.approxPolyDP(c, 0.04 * peri, True)
+        yield cv2.boundingRect(approx)
+
+
+def annotate_layout_boxes(image, rects):
+    """Draw a green outline (thickness 2) for every rectangle onto ``image`` and return ``image``.
+
+    Args:
+        image: Image to draw on; the rectangles are drawn directly onto this array.
+        rects: Iterable of ``(x, y, w, h)`` tuples.
+
+    Returns:
+        The same ``image`` object that was passed in.
+    """
+    for rect in rects:
+        (x, y, w, h) = rect
+        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+    return image
+
+
+def annotate_layout_in_pdf(pdf_path, page_index=1):
+    """Render one page of a PDF, outline the detected layout boxes and display the result with matplotlib.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to ``pdf2image.convert_from_path``.
+        page_index: Index of the page to show; 1 is added before it is passed to pdf2image as
+            ``first_page`` / ``last_page``, so the index is treated as zero-based.
+    """
+
+    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = np.array(page)
+
+    layout_boxes = find_layout_boxes(page)
+    page = annotate_layout_boxes(page, layout_boxes)
+
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(page)
+    plt.show()
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@ -0,0 +1,63 @@
+from functools import partial
+
+import cv2
+import numpy as np
+import pdf2image
+from iteration_utilities import starfilter, first
+from matplotlib import pyplot as plt
+
+
+def is_filled(hierarchy):
+    """Return True if a contour's hierarchy entry has no child and a parent index of at most 0.
+
+    Args:
+        hierarchy: Hierarchy entry of a single contour; element 2 and element 3 are read
+            (OpenCV documents the layout as ``[next, previous, first_child, parent]``).
+    """
+    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    return hierarchy[3] <= 0 and hierarchy[2] == -1
+
+
+def is_boxy(contour):
+    """Return True if the polygon approximation of ``contour`` has at most 10 vertices.
+
+    The approximation tolerance is 1% of the closed contour's perimeter.
+    """
+    epsilon = 0.01 * cv2.arcLength(contour, True)
+    approx = cv2.approxPolyDP(contour, epsilon, True)
+    return len(approx) <= 10
+
+
+def is_large_enough(contour, min_area):
+    """Return True if the area of ``contour`` (``cv2.contourArea``) is strictly greater than ``min_area``."""
+    return cv2.contourArea(contour, False) > min_area
+
+
+def is_likely_redaction(contour, hierarchy, min_area):
+    """Return True if the contour is filled, box-like and larger than ``min_area``.
+
+    Args:
+        contour: Contour to test.
+        hierarchy: Hierarchy entry belonging to ``contour``, checked by ``is_filled``.
+        min_area: Area the contour has to exceed, checked by ``is_large_enough``.
+    """
+    return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area)
+
+
+def find_redactions(image: np.array, min_normalized_area=200000):
+
+    min_normalized_area /= 200  # Assumes 200 DPI PDF -> image conversion resolution
+
+    gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    blurred = cv2.GaussianBlur(gray, (5, 5), 1)
+    thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1]
+
+    contours, hierarchies = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
+
+    contours = map(
+        first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
+    )
+    return contours
+
+
+def annotate_poly(image, contours):
+    """Draw every contour onto ``image`` in green with thickness 4 and return ``image``.
+
+    Args:
+        image: Image to draw on; the contours are drawn directly onto this array.
+        contours: Iterable of contours.
+
+    Returns:
+        The same ``image`` object that was passed in.
+    """
+    for cont in contours:
+        # NOTE(review): a single contour is passed where drawContours takes a list of contours, so
+        # presumably each point is drawn as its own contour -- confirm this is intended.
+        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
+
+    return image
+
+
+def annotate_boxes_in_pdf(pdf_path, page_index=1):
+    """Render one page of a PDF, outline the detected redactions and display the result with matplotlib.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to ``pdf2image.convert_from_path``.
+        page_index: Index of the page to show; 1 is added before it is passed to pdf2image as
+            ``first_page`` / ``last_page``, so the index is treated as zero-based.
+    """
+
+    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = np.array(page)
+
+    redaction_contours = find_redactions(page)
+    page = annotate_poly(page, redaction_contours)
+
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(page)
+    plt.show()
--- a/table_parsing/table_parsig.py
+++ b/table_parsing/table_parsig.py
--- a/vidocp/table_parsing_2.py
+++ b/vidocp/table_parsing_2.py
@ -0,0 +1,74 @@
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+from pdf2image import pdf2image
+
+
+def add_external_contours(image, img):
+    """Draw the bounding rectangle of every external contour of ``img`` onto ``image``.
+
+    Args:
+        image: Image the rectangles are drawn onto (value 255, thickness 1).
+        img: Image in which the external contours are searched.
+
+    Returns:
+        The same ``image`` object that was passed in.
+    """
+
+    contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+    for cnt in contours:
+        x, y, w, h = cv2.boundingRect(cnt)
+        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
+
+    return image
+
+
+def isolate_vertical_and_horizontal_components(img_bin):
+    """Keep only the horizontal and vertical line structures of a binary image.
+
+    The image is opened morphologically once with a 1x30 (horizontal) and once with a 30x1 (vertical)
+    kernel; the two results are combined with a bitwise OR.
+
+    Args:
+        img_bin: Binary image.
+
+    Returns:
+        The combined image of horizontal and vertical components.
+    """
+
+    line_min_width = 30
+    kernel_h = np.ones((1, line_min_width), np.uint8)
+    kernel_v = np.ones((line_min_width, 1), np.uint8)
+
+    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+
+    img_bin_final = img_bin_h | img_bin_v
+
+    return img_bin_final
+
+
+def annotate_image(image, stats):
+    """Return a copy of ``image`` with the given component boxes and their coordinates drawn in.
+
+    Args:
+        image: Image to annotate; it is copied and the copy is drawn on.
+        stats: Sequence of ``(x, y, w, h, area)`` rows.
+
+    Returns:
+        The annotated copy of ``image``.
+    """
+
+    image = image.copy()
+
+    # The first two rows are skipped -- presumably the background and the line grid itself; TODO confirm.
+    for x, y, w, h, area in stats[2:]:
+        # Only boxes wider and taller than 10 pixels are drawn.
+        if w > 10 and h > 10:
+            cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
+
+            # Write "x = ..", "y = ..", "w = ..", "h = .." in 20 pixel steps upwards from the box's lower edge.
+            for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
+                anno = f"{s} = {v}"
+                xann = int(x + 5)
+                yann = int(y + h - (20 * (i + 1)))
+                cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
+
+    return image
+
+
+def parse_table(image: np.array):
+    """Compute connected-component statistics of the regions enclosed by table lines in a page image.
+
+    Args:
+        image: Color image of the page (it is converted to grayscale via ``cv2.cvtColor``).
+
+    Returns:
+        The ``stats`` array returned by ``cv2.connectedComponentsWithStats`` for the inverted line image.
+    """
+
+    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
+    # Invert so that dark pixels (below the threshold of 150) become foreground.
+    img_bin = ~img_bin
+
+    img_bin = isolate_vertical_and_horizontal_components(img_bin)
+    # The same array is passed as drawing target and as contour source.
+    img_bin_final = add_external_contours(img_bin, img_bin)
+
+    # Components are computed on the inverted image, i.e. on the areas between the lines.
+    _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
+
+    return stats
+
+
+def annotate_tables_in_pdf(pdf_path, page_index=1):
+    """Render one page of a PDF, draw the parsed table cells and display the result with matplotlib.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to ``pdf2image.convert_from_path``.
+        page_index: Index of the page to show; 1 is added before it is passed to pdf2image as
+            ``first_page`` / ``last_page``, so the index is treated as zero-based.
+    """
+
+    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = np.array(page)
+
+    stats = parse_table(page)
+    page = annotate_image(page, stats)
+
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(page)
+    plt.show()