Pull request #4: Restructuring and renaming of module

Merge in RR/vidocp from poly_to_rects_segmentation to master Squashed commit of the following: commit 3dffe067ef0bb4796eab22007eb6970b29f47822 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 16:10:28 2022 +0100 readme updated commit 448517205259134a8427b48d86d0d5331b726487 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 16:09:35 2022 +0100 restructured dirs commit 058c2971631c71d520b1a94ea75e249f9234ad87 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 15:57:08 2022 +0100 renaming commit 4e64a3d07f1dad76775955639157ec7b60e6ad38 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 15:46:03 2022 +0100 readme updated commit 728bedb13a2769b4652fd674ef26988efebcc7dc Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 15:33:42 2022 +0100 added DVC commit e2d5594afd6683d8207007d3a85d178dd0a3e546 Author: Matthias Bisping <matthias.bisping@iqser.com> Date: Sat Feb 5 14:49:09 2022 +0100 renaming
2022-02-05 16:14:24 +01:00 · 2022-02-05 16:14:24 +01:00 · 3d4b924426
commit 3d4b924426
parent 512d217b05
21 changed files with 584 additions and 16 deletions
--- a/.dvc/.gitignore
+++ b/.dvc/.gitignore
@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
--- a/.dvc/config
+++ b/.dvc/config
@ -0,0 +1,7 @@
+[core]
+    remote = vector
+    autostage = true
+['remote "vector"']
+    url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
+    port = 22
+
--- a/.dvc/plots/confusion.json
+++ b/.dvc/plots/confusion.json
@ -0,0 +1,107 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "facet": {
+        "field": "rev",
+        "type": "nominal"
+    },
+    "spec": {
+        "transform": [
+            {
+                "aggregate": [
+                    {
+                        "op": "count",
+                        "as": "xy_count"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>",
+                    "<DVC_METRIC_X>"
+                ]
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_Y>"
+                ],
+                "key": "<DVC_METRIC_X>",
+                "value": 0
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_X>"
+                ],
+                "key": "<DVC_METRIC_Y>",
+                "value": 0
+            },
+            {
+                "joinaggregate": [
+                    {
+                        "op": "max",
+                        "field": "xy_count",
+                        "as": "max_count"
+                    }
+                ],
+                "groupby": []
+            },
+            {
+                "calculate": "datum.xy_count / datum.max_count",
+                "as": "percent_of_max"
+            }
+        ],
+        "encoding": {
+            "x": {
+                "field": "<DVC_METRIC_X>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_X_LABEL>"
+            },
+            "y": {
+                "field": "<DVC_METRIC_Y>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_Y_LABEL>"
+            }
+        },
+        "layer": [
+            {
+                "mark": "rect",
+                "width": 300,
+                "height": 300,
+                "encoding": {
+                    "color": {
+                        "field": "xy_count",
+                        "type": "quantitative",
+                        "title": "",
+                        "scale": {
+                            "domainMin": 0,
+                            "nice": true
+                        }
+                    }
+                }
+            },
+            {
+                "mark": "text",
+                "encoding": {
+                    "text": {
+                        "field": "xy_count",
+                        "type": "quantitative"
+                    },
+                    "color": {
+                        "condition": {
+                            "test": "datum.percent_of_max > 0.5",
+                            "value": "white"
+                        },
+                        "value": "black"
+                    }
+                }
+            }
+        ]
+    }
+}
--- a/.dvc/plots/confusion_normalized.json
+++ b/.dvc/plots/confusion_normalized.json
@ -0,0 +1,112 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "facet": {
+        "field": "rev",
+        "type": "nominal"
+    },
+    "spec": {
+        "transform": [
+            {
+                "aggregate": [
+                    {
+                        "op": "count",
+                        "as": "xy_count"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>",
+                    "<DVC_METRIC_X>"
+                ]
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_Y>"
+                ],
+                "key": "<DVC_METRIC_X>",
+                "value": 0
+            },
+            {
+                "impute": "xy_count",
+                "groupby": [
+                    "rev",
+                    "<DVC_METRIC_X>"
+                ],
+                "key": "<DVC_METRIC_Y>",
+                "value": 0
+            },
+            {
+                "joinaggregate": [
+                    {
+                        "op": "sum",
+                        "field": "xy_count",
+                        "as": "sum_y"
+                    }
+                ],
+                "groupby": [
+                    "<DVC_METRIC_Y>"
+                ]
+            },
+            {
+                "calculate": "datum.xy_count / datum.sum_y",
+                "as": "percent_of_y"
+            }
+        ],
+        "encoding": {
+            "x": {
+                "field": "<DVC_METRIC_X>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_X_LABEL>"
+            },
+            "y": {
+                "field": "<DVC_METRIC_Y>",
+                "type": "nominal",
+                "sort": "ascending",
+                "title": "<DVC_METRIC_Y_LABEL>"
+            }
+        },
+        "layer": [
+            {
+                "mark": "rect",
+                "width": 300,
+                "height": 300,
+                "encoding": {
+                    "color": {
+                        "field": "percent_of_y",
+                        "type": "quantitative",
+                        "title": "",
+                        "scale": {
+                            "domain": [
+                                0,
+                                1
+                            ]
+                        }
+                    }
+                }
+            },
+            {
+                "mark": "text",
+                "encoding": {
+                    "text": {
+                        "field": "percent_of_y",
+                        "type": "quantitative",
+                        "format": ".2f"
+                    },
+                    "color": {
+                        "condition": {
+                            "test": "datum.percent_of_y > 0.5",
+                            "value": "white"
+                        },
+                        "value": "black"
+                    }
+                }
+            }
+        ]
+    }
+}
--- a/.dvc/plots/linear.json
+++ b/.dvc/plots/linear.json
@ -0,0 +1,116 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "layer": [
+        {
+            "encoding": {
+                "x": {
+                    "field": "<DVC_METRIC_X>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_X_LABEL>"
+                },
+                "y": {
+                    "field": "<DVC_METRIC_Y>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_Y_LABEL>",
+                    "scale": {
+                        "zero": false
+                    }
+                },
+                "color": {
+                    "field": "rev",
+                    "type": "nominal"
+                }
+            },
+            "layer": [
+                {
+                    "mark": "line"
+                },
+                {
+                    "selection": {
+                        "label": {
+                            "type": "single",
+                            "nearest": true,
+                            "on": "mouseover",
+                            "encodings": [
+                                "x"
+                            ],
+                            "empty": "none",
+                            "clear": "mouseout"
+                        }
+                    },
+                    "mark": "point",
+                    "encoding": {
+                        "opacity": {
+                            "condition": {
+                                "selection": "label",
+                                "value": 1
+                            },
+                            "value": 0
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "transform": [
+                {
+                    "filter": {
+                        "selection": "label"
+                    }
+                }
+            ],
+            "layer": [
+                {
+                    "mark": {
+                        "type": "rule",
+                        "color": "gray"
+                    },
+                    "encoding": {
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        }
+                    }
+                },
+                {
+                    "encoding": {
+                        "text": {
+                            "type": "quantitative",
+                            "field": "<DVC_METRIC_Y>"
+                        },
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        },
+                        "y": {
+                            "field": "<DVC_METRIC_Y>",
+                            "type": "quantitative"
+                        }
+                    },
+                    "layer": [
+                        {
+                            "mark": {
+                                "type": "text",
+                                "align": "left",
+                                "dx": 5,
+                                "dy": -5
+                            },
+                            "encoding": {
+                                "color": {
+                                    "type": "nominal",
+                                    "field": "rev"
+                                }
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
--- a/.dvc/plots/scatter.json
+++ b/.dvc/plots/scatter.json
@ -0,0 +1,104 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "layer": [
+        {
+            "encoding": {
+                "x": {
+                    "field": "<DVC_METRIC_X>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_X_LABEL>"
+                },
+                "y": {
+                    "field": "<DVC_METRIC_Y>",
+                    "type": "quantitative",
+                    "title": "<DVC_METRIC_Y_LABEL>",
+                    "scale": {
+                        "zero": false
+                    }
+                },
+                "color": {
+                    "field": "rev",
+                    "type": "nominal"
+                }
+            },
+            "layer": [
+                {
+                    "mark": "point"
+                },
+                {
+                    "selection": {
+                        "label": {
+                            "type": "single",
+                            "nearest": true,
+                            "on": "mouseover",
+                            "encodings": [
+                                "x"
+                            ],
+                            "empty": "none",
+                            "clear": "mouseout"
+                        }
+                    },
+                    "mark": "point",
+                    "encoding": {
+                        "opacity": {
+                            "condition": {
+                                "selection": "label",
+                                "value": 1
+                            },
+                            "value": 0
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "transform": [
+                {
+                    "filter": {
+                        "selection": "label"
+                    }
+                }
+            ],
+            "layer": [
+                {
+                    "encoding": {
+                        "text": {
+                            "type": "quantitative",
+                            "field": "<DVC_METRIC_Y>"
+                        },
+                        "x": {
+                            "field": "<DVC_METRIC_X>",
+                            "type": "quantitative"
+                        },
+                        "y": {
+                            "field": "<DVC_METRIC_Y>",
+                            "type": "quantitative"
+                        }
+                    },
+                    "layer": [
+                        {
+                            "mark": {
+                                "type": "text",
+                                "align": "left",
+                                "dx": 5,
+                                "dy": -5
+                            },
+                            "encoding": {
+                                "color": {
+                                    "type": "nominal",
+                                    "field": "rev"
+                                }
+                            }
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
--- a/.dvc/plots/simple.json
+++ b/.dvc/plots/simple.json
@ -0,0 +1,31 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "width": 300,
+    "height": 300,
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    }
+}
--- a/.dvc/plots/smooth.json
+++ b/.dvc/plots/smooth.json
@ -0,0 +1,39 @@
+{
+    "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
+    "data": {
+        "values": "<DVC_METRIC_DATA>"
+    },
+    "title": "<DVC_METRIC_TITLE>",
+    "mark": {
+        "type": "line"
+    },
+    "encoding": {
+        "x": {
+            "field": "<DVC_METRIC_X>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_X_LABEL>"
+        },
+        "y": {
+            "field": "<DVC_METRIC_Y>",
+            "type": "quantitative",
+            "title": "<DVC_METRIC_Y_LABEL>",
+            "scale": {
+                "zero": false
+            }
+        },
+        "color": {
+            "field": "rev",
+            "type": "nominal"
+        }
+    },
+    "transform": [
+        {
+            "loess": "<DVC_METRIC_Y>",
+            "on": "<DVC_METRIC_X>",
+            "groupby": [
+                "rev"
+            ],
+            "bandwidth": 0.3
+        }
+    ]
+}
--- a/.dvcignore
+++ b/.dvcignore
@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
--- a/README.md
+++ b/README.md
@ -1,13 +1,13 @@
-# Table Parsing
+# Vidocp

 This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
-previous redactions in PDFs.
+previous redactions in documents.

 ## Installation

 ```bash
-git clone ssh://git@git.iqser.com:2222/rr/table_parsing.git
-cd table_parsing
+git clone ssh://git@git.iqser.com:2222/rr/vidocp.git
+cd vidocp

 python -m venv env
 source env/bin/activate
@ -18,10 +18,48 @@ pip install -r requirements.txt

 ## Usage

-```bash
-# Parse tables on second page of a PDF
-python scripts/annotate.py <path to pdf> 1 --type table
+### As an API

-# Detect redactions (black filled rectangles) on first page of a PDF
+The module provides functions for the individual tasks, each of which returns some kind of collection of points, depending
+on the specific task. Example for finding the outlines of previous redactions:
+
+```python
+
+from vidocp.redaction_detection import find_redactions
+import pdf2image 
+import numpy as np
+
+pdf_path = ...
+page_index = ...
+
+
+page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
+page = np.array(page)
+
+redaction_contours = find_redactions(page)
+```
+
+
+
+
+### Example outputs from demo script:
+
+
+#### Table parsing
+
+The table parsing utility detects tables and segments them into individual cells.
+```bash
+python scripts/annotate.py <path to pdf> 1 --type table
+```
+
+
+#### Detect redactions
+
+The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
+```bash
 python scripts/annotate.py <path to pdf> 0 --type redaction
 ```
+
+The image below shows the detected redactions with green outlines.
+
+![](data/redaction_detection.png)
--- a/data/.gitignore
+++ b/data/.gitignore
@ -0,0 +1 @@
+/test_pdf.pdf
--- a/data/redaction_detection.png
+++ b/data/redaction_detection.png
--- a/data/test_pdf.pdf.dvc
+++ b/data/test_pdf.pdf.dvc
@ -0,0 +1,4 @@
+outs:
+- md5: 60840305e4ddb084aea21976b8b7c49e
+  size: 6916053
+  path: test_pdf.pdf
--- a/requirements.txt
+++ b/requirements.txt
@ -4,3 +4,6 @@ pdf2image~=1.16.0
 matplotlib~=3.5.1
 imutils==0.5.4
 iteration-utilities==0.11.0
+dvc==2.9.3
+dvc[ssh]
+
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@ -1,8 +1,8 @@
 import argparse

-from table_parsing.table_parsig import annotate_tables_in_pdf
-from box_detection.redaction_detection import annotate_boxes_in_pdf
-from layout_detection.layout_detection import annotate_layout_in_pdf
+from vidocp.table_parsig import annotate_tables_in_pdf
+from vidocp.redaction_detection import annotate_boxes_in_pdf
+from vidocp.layout_detection import annotate_layout_in_pdf


 def parse_args():
--- a/setup.py
+++ b/setup.py
@ -3,11 +3,11 @@
 from distutils.core import setup

 setup(
-    name="table_parsing",
+    name="vidocp",
    version="0.0.1",
    description="",
    author="",
    author_email="",
    url="",
-    packages=["table_parsing"],
+    packages=["vidocp"],
 )
--- a/table_parsing/init.py
+++ b/table_parsing/init.py
--- a/box_detection/init.py
+++ b/box_detection/init.py
--- a/layout_detection/layout_detection.py
+++ b/layout_detection/layout_detection.py
--- a/box_detection/redaction_detection.py
+++ b/box_detection/redaction_detection.py
@ -42,9 +42,9 @@ def find_redactions(image: np.array, min_normalized_area=200000):
    return contours


-def annotate_poly(image, conts):
-    for cont in conts:
-        cv2.drawContours(image, cont, -1, (0, 255, 0), 2)
+def annotate_poly(image, contours):
+    for cont in contours:
+        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)

    return image

--- a/table_parsing/table_parsig.py
+++ b/table_parsing/table_parsig.py