Pull request #4: Restructuring and renaming of module

Merge in RR/vidocp from poly_to_rects_segmentation to master

Squashed commit of the following:

commit 3dffe067ef0bb4796eab22007eb6970b29f47822
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 16:10:28 2022 +0100

    readme updated

commit 448517205259134a8427b48d86d0d5331b726487
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 16:09:35 2022 +0100

    restructured dirs

commit 058c2971631c71d520b1a94ea75e249f9234ad87
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 15:57:08 2022 +0100

    renaming

commit 4e64a3d07f1dad76775955639157ec7b60e6ad38
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 15:46:03 2022 +0100

    readme updated

commit 728bedb13a2769b4652fd674ef26988efebcc7dc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 15:33:42 2022 +0100

    added DVC

commit e2d5594afd6683d8207007d3a85d178dd0a3e546
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Sat Feb 5 14:49:09 2022 +0100

    renaming
This commit is contained in:
Matthias Bisping 2022-02-05 16:14:24 +01:00
parent 512d217b05
commit 3d4b924426
21 changed files with 584 additions and 16 deletions

3
.dvc/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

7
.dvc/config Normal file
View File

@ -0,0 +1,7 @@
[core]
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
port = 22

107
.dvc/plots/confusion.json Normal file
View File

@ -0,0 +1,107 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"facet": {
"field": "rev",
"type": "nominal"
},
"spec": {
"transform": [
{
"aggregate": [
{
"op": "count",
"as": "xy_count"
}
],
"groupby": [
"<DVC_METRIC_Y>",
"<DVC_METRIC_X>"
]
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_Y>"
],
"key": "<DVC_METRIC_X>",
"value": 0
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_X>"
],
"key": "<DVC_METRIC_Y>",
"value": 0
},
{
"joinaggregate": [
{
"op": "max",
"field": "xy_count",
"as": "max_count"
}
],
"groupby": []
},
{
"calculate": "datum.xy_count / datum.max_count",
"as": "percent_of_max"
}
],
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_Y_LABEL>"
}
},
"layer": [
{
"mark": "rect",
"width": 300,
"height": 300,
"encoding": {
"color": {
"field": "xy_count",
"type": "quantitative",
"title": "",
"scale": {
"domainMin": 0,
"nice": true
}
}
}
},
{
"mark": "text",
"encoding": {
"text": {
"field": "xy_count",
"type": "quantitative"
},
"color": {
"condition": {
"test": "datum.percent_of_max > 0.5",
"value": "white"
},
"value": "black"
}
}
}
]
}
}

View File

@ -0,0 +1,112 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"facet": {
"field": "rev",
"type": "nominal"
},
"spec": {
"transform": [
{
"aggregate": [
{
"op": "count",
"as": "xy_count"
}
],
"groupby": [
"<DVC_METRIC_Y>",
"<DVC_METRIC_X>"
]
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_Y>"
],
"key": "<DVC_METRIC_X>",
"value": 0
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_X>"
],
"key": "<DVC_METRIC_Y>",
"value": 0
},
{
"joinaggregate": [
{
"op": "sum",
"field": "xy_count",
"as": "sum_y"
}
],
"groupby": [
"<DVC_METRIC_Y>"
]
},
{
"calculate": "datum.xy_count / datum.sum_y",
"as": "percent_of_y"
}
],
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_Y_LABEL>"
}
},
"layer": [
{
"mark": "rect",
"width": 300,
"height": 300,
"encoding": {
"color": {
"field": "percent_of_y",
"type": "quantitative",
"title": "",
"scale": {
"domain": [
0,
1
]
}
}
}
},
{
"mark": "text",
"encoding": {
"text": {
"field": "percent_of_y",
"type": "quantitative",
"format": ".2f"
},
"color": {
"condition": {
"test": "datum.percent_of_y > 0.5",
"value": "white"
},
"value": "black"
}
}
}
]
}
}

116
.dvc/plots/linear.json Normal file
View File

@ -0,0 +1,116 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"width": 300,
"height": 300,
"layer": [
{
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"layer": [
{
"mark": "line"
},
{
"selection": {
"label": {
"type": "single",
"nearest": true,
"on": "mouseover",
"encodings": [
"x"
],
"empty": "none",
"clear": "mouseout"
}
},
"mark": "point",
"encoding": {
"opacity": {
"condition": {
"selection": "label",
"value": 1
},
"value": 0
}
}
}
]
},
{
"transform": [
{
"filter": {
"selection": "label"
}
}
],
"layer": [
{
"mark": {
"type": "rule",
"color": "gray"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative"
}
}
},
{
"encoding": {
"text": {
"type": "quantitative",
"field": "<DVC_METRIC_Y>"
},
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative"
}
},
"layer": [
{
"mark": {
"type": "text",
"align": "left",
"dx": 5,
"dy": -5
},
"encoding": {
"color": {
"type": "nominal",
"field": "rev"
}
}
}
]
}
]
}
]
}

104
.dvc/plots/scatter.json Normal file
View File

@ -0,0 +1,104 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"width": 300,
"height": 300,
"layer": [
{
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"layer": [
{
"mark": "point"
},
{
"selection": {
"label": {
"type": "single",
"nearest": true,
"on": "mouseover",
"encodings": [
"x"
],
"empty": "none",
"clear": "mouseout"
}
},
"mark": "point",
"encoding": {
"opacity": {
"condition": {
"selection": "label",
"value": 1
},
"value": 0
}
}
}
]
},
{
"transform": [
{
"filter": {
"selection": "label"
}
}
],
"layer": [
{
"encoding": {
"text": {
"type": "quantitative",
"field": "<DVC_METRIC_Y>"
},
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative"
}
},
"layer": [
{
"mark": {
"type": "text",
"align": "left",
"dx": 5,
"dy": -5
},
"encoding": {
"color": {
"type": "nominal",
"field": "rev"
}
}
}
]
}
]
}
]
}

31
.dvc/plots/simple.json Normal file
View File

@ -0,0 +1,31 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"width": 300,
"height": 300,
"mark": {
"type": "line"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
}
}

39
.dvc/plots/smooth.json Normal file
View File

@ -0,0 +1,39 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"mark": {
"type": "line"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"transform": [
{
"loess": "<DVC_METRIC_Y>",
"on": "<DVC_METRIC_X>",
"groupby": [
"rev"
],
"bandwidth": 0.3
}
]
}

3
.dvcignore Normal file
View File

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

View File

@ -1,13 +1,13 @@
# Table Parsing
# Vidocp
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in PDFs.
previous redactions in documents.
## Installation
```bash
git clone ssh://git@git.iqser.com:2222/rr/table_parsing.git
cd table_parsing
git clone ssh://git@git.iqser.com:2222/rr/vidocp.git
cd vidocp
python -m venv env
source env/bin/activate
@ -18,10 +18,48 @@ pip install -r requirements.txt
## Usage
```bash
# Parse tables on second page of a PDF
python scripts/annotate.py <path to pdf> 1 --type table
### As an API
# Detect redactions (black filled rectangles) on first page of a PDF
The module provided functions for the individual tasks that all return some kid of collection of points, depending on
the specific task. Example for finding the outlines of previous redactions.
```python
from vidocp.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
```
### Example outputs from demo script:
#### Table parsing
The tables parsing utility detects and segments tables into individual cells.
```bash
python scripts/annotate.py <path to pdf> 1 --type table
```
#### Detect redactions
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
```bash
python scripts/annotate.py <path to pdf> 0 --type redaction
```
The below image shows the detected redactions with green outlines.
![](data/redaction_detection.png)

1
data/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/test_pdf.pdf

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 MiB

4
data/test_pdf.pdf.dvc Normal file
View File

@ -0,0 +1,4 @@
outs:
- md5: 60840305e4ddb084aea21976b8b7c49e
size: 6916053
path: test_pdf.pdf

View File

@ -4,3 +4,6 @@ pdf2image~=1.16.0
matplotlib~=3.5.1
imutils==0.5.4
iteration-utilities==0.11.0
dvc==2.9.3
dvc[ssh]

View File

@ -1,8 +1,8 @@
import argparse
from table_parsing.table_parsig import annotate_tables_in_pdf
from box_detection.redaction_detection import annotate_boxes_in_pdf
from layout_detection.layout_detection import annotate_layout_in_pdf
from vidocp.table_parsig import annotate_tables_in_pdf
from vidocp.redaction_detection import annotate_boxes_in_pdf
from vidocp.layout_detection import annotate_layout_in_pdf
def parse_args():

View File

@ -3,11 +3,11 @@
from distutils.core import setup
setup(
name="table_parsing",
name="vidocp",
version="0.0.1",
description="",
author="",
author_email="",
url="",
packages=["table_parsing"],
packages=["vidocp"],
)

View File

@ -42,9 +42,9 @@ def find_redactions(image: np.array, min_normalized_area=200000):
return contours
def annotate_poly(image, conts):
for cont in conts:
cv2.drawContours(image, cont, -1, (0, 255, 0), 2)
def annotate_poly(image, contours):
for cont in contours:
cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
return image