diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..cb89cc4 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,7 @@ +[core] + remote = vector + autostage = true +['remote "vector"'] + url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/ + port = 22 + diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json new file mode 100644 index 0000000..af1b48d --- /dev/null +++ b/.dvc/plots/confusion.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "max", + "field": "xy_count", + "as": "max_count" + } + ], + "groupby": [] + }, + { + "calculate": "datum.xy_count / datum.max_count", + "as": "percent_of_max" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "xy_count", + "type": "quantitative", + "title": "", + "scale": { + "domainMin": 0, + "nice": true + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "xy_count", + "type": "quantitative" + }, + "color": { + "condition": { + "test": "datum.percent_of_max > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git 
a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json new file mode 100644 index 0000000..1d38849 --- /dev/null +++ b/.dvc/plots/confusion_normalized.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "sum", + "field": "xy_count", + "as": "sum_y" + } + ], + "groupby": [ + "" + ] + }, + { + "calculate": "datum.xy_count / datum.sum_y", + "as": "percent_of_y" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "percent_of_y", + "type": "quantitative", + "title": "", + "scale": { + "domain": [ + 0, + 1 + ] + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "percent_of_y", + "type": "quantitative", + "format": ".2f" + }, + "color": { + "condition": { + "test": "datum.percent_of_y > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json new file mode 100644 index 0000000..65549f9 --- /dev/null +++ b/.dvc/plots/linear.json @@ -0,0 +1,116 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + 
"field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "line" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "mark": { + "type": "rule", + "color": "gray" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative" + } + } + }, + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json new file mode 100644 index 0000000..9af9304 --- /dev/null +++ b/.dvc/plots/scatter.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "point" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + 
"value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/simple.json b/.dvc/plots/simple.json new file mode 100644 index 0000000..9cf71ce --- /dev/null +++ b/.dvc/plots/simple.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + } +} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json new file mode 100644 index 0000000..d497ce7 --- /dev/null +++ b/.dvc/plots/smooth.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "transform": [ + { + "loess": "", + "on": "", + "groupby": [ + "rev" + ], + "bandwidth": 0.3 + } + ] +} diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/README.md b/README.md new file mode 100644 index 0000000..549a674 --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +# Vidocp — Visual Document Parsing + +This repository implements computer-vision-based approaches for detecting and parsing visual features such as tables or +previous redactions in documents. + +## Installation + +```bash +git clone ssh://git@git.iqser.com:2222/rr/vidocp.git +cd vidocp + +python -m venv env +source env/bin/activate + +pip install -e . +pip install -r requirements.txt + +dvc pull +``` + +## Usage + +### As an API + +The module provides functions for the individual tasks that all return some kind of collection of points, depending on +the specific task. Example: finding the outlines of previous redactions. + +```python + +from vidocp.redaction_detection import find_redactions +import pdf2image +import numpy as np + +pdf_path = ... +page_index = ... + + +page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0] +page = np.array(page) + +redaction_contours = find_redactions(page) +``` + + +### As a CLI Tool + + +Core API functionalities can be used through a CLI. + + +#### Table Parsing + +The table parsing utility detects and segments tables into individual cells. +```bash +python scripts/annotate.py data/test_pdf.pdf 2 --type table +``` + + +#### Redaction Detection + +The redaction detection utility detects previous redactions in PDFs (black filled rectangles). +```bash +python scripts/annotate.py data/test_pdf.pdf 0 --type redaction +``` + +The image below shows the detected redactions with green outlines. 
+ +![](data/redaction_detection.png) diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..09d8485 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/test_pdf.pdf diff --git a/data/redaction_detection.png b/data/redaction_detection.png new file mode 100644 index 0000000..6df3f30 Binary files /dev/null and b/data/redaction_detection.png differ diff --git a/data/test_pdf.pdf.dvc b/data/test_pdf.pdf.dvc new file mode 100644 index 0000000..4eff9a4 --- /dev/null +++ b/data/test_pdf.pdf.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 60840305e4ddb084aea21976b8b7c49e + size: 6916053 + path: test_pdf.pdf diff --git a/requirements.txt b/requirements.txt index 2be77e5..913a63f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,8 @@ opencv-python~=4.5.5.62 numpy~=1.22.1 pdf2image~=1.16.0 matplotlib~=3.5.1 -imutils~=0.5.4 \ No newline at end of file +imutils==0.5.4 +iteration-utilities==0.11.0 +dvc==2.9.3 +dvc[ssh] + diff --git a/scripts/annotate.py b/scripts/annotate.py new file mode 100644 index 0000000..95de313 --- /dev/null +++ b/scripts/annotate.py @@ -0,0 +1,26 @@ +import argparse + +from vidocp.table_parsing_2 import annotate_tables_in_pdf +from vidocp.redaction_detection import annotate_boxes_in_pdf +from vidocp.layout_detection import annotate_layout_in_pdf + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("pdf_path") + parser.add_argument("page_index", type=int) + parser.add_argument("--type", choices=["table", "redaction", "layout"], default="table") + + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + args = parse_args() + if args.type == "table": + annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) + elif args.type == "redaction": + annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index) + elif args.type == "layout": + annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index) diff --git a/scripts/annotate_table.py b/scripts/annotate_table.py 
deleted file mode 100644 index 44e0985..0000000 --- a/scripts/annotate_table.py +++ /dev/null @@ -1,18 +0,0 @@ -import argparse - -from table_parsing.table_parsig import annotate_tables_in_pdf - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("pdf_path") - parser.add_argument("page_index", type=int) - - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - args = parse_args() - annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) diff --git a/setup.py b/setup.py index 0cd4f7f..9fc73a8 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,11 @@ from distutils.core import setup setup( - name="table_parsing", + name="vidocp", version="0.0.1", description="", author="", author_email="", url="", - packages=["table_parsing"], + packages=["vidocp"], ) diff --git a/table_parsing/__init__.py b/vidocp/__init__.py similarity index 100% rename from table_parsing/__init__.py rename to vidocp/__init__.py diff --git a/vidocp/layout_detection.py b/vidocp/layout_detection.py new file mode 100644 index 0000000..40e9e58 --- /dev/null +++ b/vidocp/layout_detection.py @@ -0,0 +1,53 @@ +from itertools import count + +import cv2 +import numpy as np +import pdf2image +from matplotlib import pyplot as plt +import imutils + + +def find_layout_boxes(image: np.array): + + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) + thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] + img_bin = ~thresh + + line_min_width = 10 + kernel_h = np.ones((10, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 10), np.uint8) + + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + + img_bin_final = img_bin_h | img_bin_v + + contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = imutils.grab_contours(contours) + for c in contours: + peri = 
cv2.arcLength(c, True) + approx = cv2.approxPolyDP(c, 0.04 * peri, True) + yield cv2.boundingRect(approx) + + +def annotate_layout_boxes(image, rects): + for rect in rects: + (x, y, w, h) = rect + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) + + return image + + +def annotate_layout_in_pdf(pdf_path, page_index=1): + + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + layout_boxes = find_layout_boxes(page) + page = annotate_layout_boxes(page, layout_boxes) + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(page) + plt.show() diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py new file mode 100644 index 0000000..b071c93 --- /dev/null +++ b/vidocp/redaction_detection.py @@ -0,0 +1,63 @@ +from functools import partial + +import cv2 +import numpy as np +import pdf2image +from iteration_utilities import starfilter, first +from matplotlib import pyplot as plt + + +def is_filled(hierarchy): + # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + return hierarchy[3] <= 0 and hierarchy[2] == -1 + + +def is_boxy(contour): + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + return len(approx) <= 10 + + +def is_large_enough(contour, min_area): + return cv2.contourArea(contour, False) > min_area + + +def is_likely_redaction(contour, hierarchy, min_area): + return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area) + + +def find_redactions(image: np.array, min_normalized_area=200000): + + min_normalized_area /= 200 # Assumes 200 DPI PDF -> image conversion resolution + + gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + blurred = cv2.GaussianBlur(gray, (5, 5), 1) + thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1] + + contours, hierarchies = cv2.findContours(thresh.copy(), 
cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) + + contours = map( + first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0])) + ) + return contours + + +def annotate_poly(image, contours): + for cont in contours: + cv2.drawContours(image, cont, -1, (0, 255, 0), 4) + + return image + + +def annotate_boxes_in_pdf(pdf_path, page_index=1): + + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + redaction_contours = find_redactions(page) + page = annotate_poly(page, redaction_contours) + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(page) + plt.show() diff --git a/table_parsing/table_parsig.py b/vidocp/table_parsig.py similarity index 100% rename from table_parsing/table_parsig.py rename to vidocp/table_parsig.py diff --git a/vidocp/table_parsing_2.py b/vidocp/table_parsing_2.py new file mode 100644 index 0000000..8b035bf --- /dev/null +++ b/vidocp/table_parsing_2.py @@ -0,0 +1,74 @@ +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from pdf2image import pdf2image + + +def add_external_contours(image, img): + + contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + for cnt in contours: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) + + return image + + +def isolate_vertical_and_horizontal_components(img_bin): + + line_min_width = 30 + kernel_h = np.ones((1, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 1), np.uint8) + + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + + img_bin_final = img_bin_h | img_bin_v + + return img_bin_final + + +def annotate_image(image, stats): + + image = image.copy() + + for x, y, w, h, area in stats[2:]: + if w > 10 and h > 10: + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + + for i, 
(s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + + return image + + +def parse_table(image: np.array): + + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) + img_bin = ~img_bin + + img_bin = isolate_vertical_and_horizontal_components(img_bin) + img_bin_final = add_external_contours(img_bin, img_bin) + + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + + return stats + + +def annotate_tables_in_pdf(pdf_path, page_index=1): + + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + stats = parse_table(page) + page = annotate_image(page, stats) + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(page) + plt.show()