diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..cb89cc4 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,7 @@ +[core] + remote = vector + autostage = true +['remote "vector"'] + url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/ + port = 22 + diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json new file mode 100644 index 0000000..af1b48d --- /dev/null +++ b/.dvc/plots/confusion.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "max", + "field": "xy_count", + "as": "max_count" + } + ], + "groupby": [] + }, + { + "calculate": "datum.xy_count / datum.max_count", + "as": "percent_of_max" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "xy_count", + "type": "quantitative", + "title": "", + "scale": { + "domainMin": 0, + "nice": true + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "xy_count", + "type": "quantitative" + }, + "color": { + "condition": { + "test": "datum.percent_of_max > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git 
a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json new file mode 100644 index 0000000..1d38849 --- /dev/null +++ b/.dvc/plots/confusion_normalized.json @@ -0,0 +1,112 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "facet": { + "field": "rev", + "type": "nominal" + }, + "spec": { + "transform": [ + { + "aggregate": [ + { + "op": "count", + "as": "xy_count" + } + ], + "groupby": [ + "", + "" + ] + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "impute": "xy_count", + "groupby": [ + "rev", + "" + ], + "key": "", + "value": 0 + }, + { + "joinaggregate": [ + { + "op": "sum", + "field": "xy_count", + "as": "sum_y" + } + ], + "groupby": [ + "" + ] + }, + { + "calculate": "datum.xy_count / datum.sum_y", + "as": "percent_of_y" + } + ], + "encoding": { + "x": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + }, + "y": { + "field": "", + "type": "nominal", + "sort": "ascending", + "title": "" + } + }, + "layer": [ + { + "mark": "rect", + "width": 300, + "height": 300, + "encoding": { + "color": { + "field": "percent_of_y", + "type": "quantitative", + "title": "", + "scale": { + "domain": [ + 0, + 1 + ] + } + } + } + }, + { + "mark": "text", + "encoding": { + "text": { + "field": "percent_of_y", + "type": "quantitative", + "format": ".2f" + }, + "color": { + "condition": { + "test": "datum.percent_of_y > 0.5", + "value": "white" + }, + "value": "black" + } + } + } + ] + } +} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json new file mode 100644 index 0000000..65549f9 --- /dev/null +++ b/.dvc/plots/linear.json @@ -0,0 +1,116 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + 
"field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "line" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + "value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "mark": { + "type": "rule", + "color": "gray" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative" + } + } + }, + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json new file mode 100644 index 0000000..9af9304 --- /dev/null +++ b/.dvc/plots/scatter.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "layer": [ + { + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "layer": [ + { + "mark": "point" + }, + { + "selection": { + "label": { + "type": "single", + "nearest": true, + "on": "mouseover", + "encodings": [ + "x" + ], + "empty": "none", + "clear": "mouseout" + } + }, + "mark": "point", + "encoding": { + "opacity": { + "condition": { + "selection": "label", + 
"value": 1 + }, + "value": 0 + } + } + } + ] + }, + { + "transform": [ + { + "filter": { + "selection": "label" + } + } + ], + "layer": [ + { + "encoding": { + "text": { + "type": "quantitative", + "field": "" + }, + "x": { + "field": "", + "type": "quantitative" + }, + "y": { + "field": "", + "type": "quantitative" + } + }, + "layer": [ + { + "mark": { + "type": "text", + "align": "left", + "dx": 5, + "dy": -5 + }, + "encoding": { + "color": { + "type": "nominal", + "field": "rev" + } + } + } + ] + } + ] + } + ] +} diff --git a/.dvc/plots/simple.json b/.dvc/plots/simple.json new file mode 100644 index 0000000..9cf71ce --- /dev/null +++ b/.dvc/plots/simple.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "width": 300, + "height": 300, + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + } +} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json new file mode 100644 index 0000000..d497ce7 --- /dev/null +++ b/.dvc/plots/smooth.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v4.json", + "data": { + "values": "" + }, + "title": "", + "mark": { + "type": "line" + }, + "encoding": { + "x": { + "field": "", + "type": "quantitative", + "title": "" + }, + "y": { + "field": "", + "type": "quantitative", + "title": "", + "scale": { + "zero": false + } + }, + "color": { + "field": "rev", + "type": "nominal" + } + }, + "transform": [ + { + "loess": "", + "on": "", + "groupby": [ + "rev" + ], + "bandwidth": 0.3 + } + ] +} diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/README.md b/README.md index c66a861..1c0b947 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# Table Parsing +# Vidocp This repository implements computer vision based approaches for detecting and parsing visual features such as tables or -previous redactions in PDFs. +previous redactions in documents. ## Installation ```bash -git clone ssh://git@git.iqser.com:2222/rr/table_parsing.git -cd table_parsing +git clone ssh://git@git.iqser.com:2222/rr/vidocp.git +cd vidocp python -m venv env source env/bin/activate @@ -18,10 +18,48 @@ pip install -r requirements.txt ## Usage -```bash -# Parse tables on second page of a PDF -python scripts/annotate.py 1 --type table +### As an API -# Detect redactions (black filled rectangles) on first page of a PDF +The module provides functions for the individual tasks that all return some kind of collection of points, depending on +the specific task. Example: finding the outlines of previous redactions. + +```python + +from vidocp.redaction_detection import find_redactions +import pdf2image +import numpy as np + +pdf_path = ... +page_index = ... + + +page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0] +page = np.array(page) + +redaction_contours = find_redactions(page) +``` + + + + +### Example outputs from demo script: + + +#### Table parsing + +The table parsing utility detects and segments tables into individual cells. +```bash +python scripts/annotate.py 1 --type table +``` + + +#### Detect redactions + +The redaction detection utility detects previous redactions in PDFs (black filled rectangles). +```bash python scripts/annotate.py 0 --type redaction ``` + +The image below shows the detected redactions with green outlines. 
+ +![](data/redaction_detection.png) diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..09d8485 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/test_pdf.pdf diff --git a/data/redaction_detection.png b/data/redaction_detection.png new file mode 100644 index 0000000..6df3f30 Binary files /dev/null and b/data/redaction_detection.png differ diff --git a/data/test_pdf.pdf.dvc b/data/test_pdf.pdf.dvc new file mode 100644 index 0000000..4eff9a4 --- /dev/null +++ b/data/test_pdf.pdf.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 60840305e4ddb084aea21976b8b7c49e + size: 6916053 + path: test_pdf.pdf diff --git a/requirements.txt b/requirements.txt index a3f596f..913a63f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,6 @@ pdf2image~=1.16.0 matplotlib~=3.5.1 imutils==0.5.4 iteration-utilities==0.11.0 +dvc==2.9.3 +dvc[ssh] + diff --git a/scripts/annotate.py b/scripts/annotate.py index 9a8b048..4c6d7b8 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -1,8 +1,8 @@ import argparse -from table_parsing.table_parsig import annotate_tables_in_pdf -from box_detection.redaction_detection import annotate_boxes_in_pdf -from layout_detection.layout_detection import annotate_layout_in_pdf +from vidocp.table_parsig import annotate_tables_in_pdf +from vidocp.redaction_detection import annotate_boxes_in_pdf +from vidocp.layout_detection import annotate_layout_in_pdf def parse_args(): diff --git a/setup.py b/setup.py index 0cd4f7f..9fc73a8 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,11 @@ from distutils.core import setup setup( - name="table_parsing", + name="vidocp", version="0.0.1", description="", author="", author_email="", url="", - packages=["table_parsing"], + packages=["vidocp"], ) diff --git a/table_parsing/__init__.py b/table_parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/box_detection/__init__.py b/vidocp/__init__.py similarity index 100% rename from box_detection/__init__.py rename to 
vidocp/__init__.py diff --git a/layout_detection/layout_detection.py b/vidocp/layout_detection.py similarity index 100% rename from layout_detection/layout_detection.py rename to vidocp/layout_detection.py diff --git a/box_detection/redaction_detection.py b/vidocp/redaction_detection.py similarity index 93% rename from box_detection/redaction_detection.py rename to vidocp/redaction_detection.py index 6b1c390..b071c93 100644 --- a/box_detection/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -42,9 +42,9 @@ def find_redactions(image: np.array, min_normalized_area=200000): return contours -def annotate_poly(image, conts): - for cont in conts: - cv2.drawContours(image, cont, -1, (0, 255, 0), 2) +def annotate_poly(image, contours): + for cont in contours: + cv2.drawContours(image, cont, -1, (0, 255, 0), 4) return image diff --git a/table_parsing/table_parsig.py b/vidocp/table_parsig.py similarity index 100% rename from table_parsing/table_parsig.py rename to vidocp/table_parsig.py