Pull request #4: Restructuring and renaming of module
Merge in RR/vidocp from poly_to_rects_segmentation to master
Squashed commit of the following:
commit 3dffe067ef0bb4796eab22007eb6970b29f47822
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 16:10:28 2022 +0100
readme updated
commit 448517205259134a8427b48d86d0d5331b726487
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 16:09:35 2022 +0100
restructured dirs
commit 058c2971631c71d520b1a94ea75e249f9234ad87
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 15:57:08 2022 +0100
renaming
commit 4e64a3d07f1dad76775955639157ec7b60e6ad38
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 15:46:03 2022 +0100
readme updated
commit 728bedb13a2769b4652fd674ef26988efebcc7dc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 15:33:42 2022 +0100
added DVC
commit e2d5594afd6683d8207007d3a85d178dd0a3e546
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sat Feb 5 14:49:09 2022 +0100
renaming
This commit is contained in:
parent
512d217b05
commit
3d4b924426
3
.dvc/.gitignore
vendored
Normal file
3
.dvc/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/config.local
|
||||
/tmp
|
||||
/cache
|
||||
7
.dvc/config
Normal file
7
.dvc/config
Normal file
@ -0,0 +1,7 @@
|
||||
[core]
|
||||
remote = vector
|
||||
autostage = true
|
||||
['remote "vector"']
|
||||
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
|
||||
port = 22
|
||||
|
||||
107
.dvc/plots/confusion.json
Normal file
107
.dvc/plots/confusion.json
Normal file
@ -0,0 +1,107 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"facet": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
},
|
||||
"spec": {
|
||||
"transform": [
|
||||
{
|
||||
"aggregate": [
|
||||
{
|
||||
"op": "count",
|
||||
"as": "xy_count"
|
||||
}
|
||||
],
|
||||
"groupby": [
|
||||
"<DVC_METRIC_Y>",
|
||||
"<DVC_METRIC_X>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_Y>"
|
||||
],
|
||||
"key": "<DVC_METRIC_X>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_X>"
|
||||
],
|
||||
"key": "<DVC_METRIC_Y>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"joinaggregate": [
|
||||
{
|
||||
"op": "max",
|
||||
"field": "xy_count",
|
||||
"as": "max_count"
|
||||
}
|
||||
],
|
||||
"groupby": []
|
||||
},
|
||||
{
|
||||
"calculate": "datum.xy_count / datum.max_count",
|
||||
"as": "percent_of_max"
|
||||
}
|
||||
],
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_Y_LABEL>"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "rect",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"encoding": {
|
||||
"color": {
|
||||
"field": "xy_count",
|
||||
"type": "quantitative",
|
||||
"title": "",
|
||||
"scale": {
|
||||
"domainMin": 0,
|
||||
"nice": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"mark": "text",
|
||||
"encoding": {
|
||||
"text": {
|
||||
"field": "xy_count",
|
||||
"type": "quantitative"
|
||||
},
|
||||
"color": {
|
||||
"condition": {
|
||||
"test": "datum.percent_of_max > 0.5",
|
||||
"value": "white"
|
||||
},
|
||||
"value": "black"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
112
.dvc/plots/confusion_normalized.json
Normal file
112
.dvc/plots/confusion_normalized.json
Normal file
@ -0,0 +1,112 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"facet": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
},
|
||||
"spec": {
|
||||
"transform": [
|
||||
{
|
||||
"aggregate": [
|
||||
{
|
||||
"op": "count",
|
||||
"as": "xy_count"
|
||||
}
|
||||
],
|
||||
"groupby": [
|
||||
"<DVC_METRIC_Y>",
|
||||
"<DVC_METRIC_X>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_Y>"
|
||||
],
|
||||
"key": "<DVC_METRIC_X>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_X>"
|
||||
],
|
||||
"key": "<DVC_METRIC_Y>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"joinaggregate": [
|
||||
{
|
||||
"op": "sum",
|
||||
"field": "xy_count",
|
||||
"as": "sum_y"
|
||||
}
|
||||
],
|
||||
"groupby": [
|
||||
"<DVC_METRIC_Y>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"calculate": "datum.xy_count / datum.sum_y",
|
||||
"as": "percent_of_y"
|
||||
}
|
||||
],
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_Y_LABEL>"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "rect",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"encoding": {
|
||||
"color": {
|
||||
"field": "percent_of_y",
|
||||
"type": "quantitative",
|
||||
"title": "",
|
||||
"scale": {
|
||||
"domain": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"mark": "text",
|
||||
"encoding": {
|
||||
"text": {
|
||||
"field": "percent_of_y",
|
||||
"type": "quantitative",
|
||||
"format": ".2f"
|
||||
},
|
||||
"color": {
|
||||
"condition": {
|
||||
"test": "datum.percent_of_y > 0.5",
|
||||
"value": "white"
|
||||
},
|
||||
"value": "black"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
116
.dvc/plots/linear.json
Normal file
116
.dvc/plots/linear.json
Normal file
@ -0,0 +1,116 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"layer": [
|
||||
{
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "line"
|
||||
},
|
||||
{
|
||||
"selection": {
|
||||
"label": {
|
||||
"type": "single",
|
||||
"nearest": true,
|
||||
"on": "mouseover",
|
||||
"encodings": [
|
||||
"x"
|
||||
],
|
||||
"empty": "none",
|
||||
"clear": "mouseout"
|
||||
}
|
||||
},
|
||||
"mark": "point",
|
||||
"encoding": {
|
||||
"opacity": {
|
||||
"condition": {
|
||||
"selection": "label",
|
||||
"value": 1
|
||||
},
|
||||
"value": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"transform": [
|
||||
{
|
||||
"filter": {
|
||||
"selection": "label"
|
||||
}
|
||||
}
|
||||
],
|
||||
"layer": [
|
||||
{
|
||||
"mark": {
|
||||
"type": "rule",
|
||||
"color": "gray"
|
||||
},
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"encoding": {
|
||||
"text": {
|
||||
"type": "quantitative",
|
||||
"field": "<DVC_METRIC_Y>"
|
||||
},
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": {
|
||||
"type": "text",
|
||||
"align": "left",
|
||||
"dx": 5,
|
||||
"dy": -5
|
||||
},
|
||||
"encoding": {
|
||||
"color": {
|
||||
"type": "nominal",
|
||||
"field": "rev"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
104
.dvc/plots/scatter.json
Normal file
104
.dvc/plots/scatter.json
Normal file
@ -0,0 +1,104 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"layer": [
|
||||
{
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "point"
|
||||
},
|
||||
{
|
||||
"selection": {
|
||||
"label": {
|
||||
"type": "single",
|
||||
"nearest": true,
|
||||
"on": "mouseover",
|
||||
"encodings": [
|
||||
"x"
|
||||
],
|
||||
"empty": "none",
|
||||
"clear": "mouseout"
|
||||
}
|
||||
},
|
||||
"mark": "point",
|
||||
"encoding": {
|
||||
"opacity": {
|
||||
"condition": {
|
||||
"selection": "label",
|
||||
"value": 1
|
||||
},
|
||||
"value": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"transform": [
|
||||
{
|
||||
"filter": {
|
||||
"selection": "label"
|
||||
}
|
||||
}
|
||||
],
|
||||
"layer": [
|
||||
{
|
||||
"encoding": {
|
||||
"text": {
|
||||
"type": "quantitative",
|
||||
"field": "<DVC_METRIC_Y>"
|
||||
},
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": {
|
||||
"type": "text",
|
||||
"align": "left",
|
||||
"dx": 5,
|
||||
"dy": -5
|
||||
},
|
||||
"encoding": {
|
||||
"color": {
|
||||
"type": "nominal",
|
||||
"field": "rev"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
31
.dvc/plots/simple.json
Normal file
31
.dvc/plots/simple.json
Normal file
@ -0,0 +1,31 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"mark": {
|
||||
"type": "line"
|
||||
},
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
}
|
||||
}
|
||||
39
.dvc/plots/smooth.json
Normal file
39
.dvc/plots/smooth.json
Normal file
@ -0,0 +1,39 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"mark": {
|
||||
"type": "line"
|
||||
},
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
},
|
||||
"transform": [
|
||||
{
|
||||
"loess": "<DVC_METRIC_Y>",
|
||||
"on": "<DVC_METRIC_X>",
|
||||
"groupby": [
|
||||
"rev"
|
||||
],
|
||||
"bandwidth": 0.3
|
||||
}
|
||||
]
|
||||
}
|
||||
3
.dvcignore
Normal file
3
.dvcignore
Normal file
@ -0,0 +1,3 @@
|
||||
# Add patterns of files dvc should ignore, which could improve
|
||||
# the performance. Learn more at
|
||||
# https://dvc.org/doc/user-guide/dvcignore
|
||||
54
README.md
54
README.md
@ -1,13 +1,13 @@
|
||||
# Table Parsing
|
||||
# Vidocp
|
||||
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in PDFs.
|
||||
previous redactions in documents.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
git clone ssh://git@git.iqser.com:2222/rr/table_parsing.git
|
||||
cd table_parsing
|
||||
git clone ssh://git@git.iqser.com:2222/rr/vidocp.git
|
||||
cd vidocp
|
||||
|
||||
python -m venv env
|
||||
source env/bin/activate
|
||||
@ -18,10 +18,48 @@ pip install -r requirements.txt
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Parse tables on second page of a PDF
|
||||
python scripts/annotate.py <path to pdf> 1 --type table
|
||||
### As an API
|
||||
|
||||
# Detect redactions (black filled rectangles) on first page of a PDF
|
||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task. Example for finding the outlines of previous redactions:
|
||||
|
||||
```python
|
||||
|
||||
from vidocp.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
|
||||
page = np.array(page)
|
||||
|
||||
redaction_contours = find_redactions(page)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
### Example outputs from demo script:
|
||||
|
||||
|
||||
#### Table parsing
|
||||
|
||||
The table parsing utility detects and segments tables into individual cells.
|
||||
```bash
|
||||
python scripts/annotate.py <path to pdf> 1 --type table
|
||||
```
|
||||
|
||||
|
||||
#### Detect redactions
|
||||
|
||||
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
|
||||
```bash
|
||||
python scripts/annotate.py <path to pdf> 0 --type redaction
|
||||
```
|
||||
|
||||
The below image shows the detected redactions with green outlines.
|
||||
|
||||

|
||||
|
||||
1
data/.gitignore
vendored
Normal file
1
data/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/test_pdf.pdf
|
||||
BIN
data/redaction_detection.png
Normal file
BIN
data/redaction_detection.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.2 MiB |
4
data/test_pdf.pdf.dvc
Normal file
4
data/test_pdf.pdf.dvc
Normal file
@ -0,0 +1,4 @@
|
||||
outs:
|
||||
- md5: 60840305e4ddb084aea21976b8b7c49e
|
||||
size: 6916053
|
||||
path: test_pdf.pdf
|
||||
@ -4,3 +4,6 @@ pdf2image~=1.16.0
|
||||
matplotlib~=3.5.1
|
||||
imutils==0.5.4
|
||||
iteration-utilities==0.11.0
|
||||
dvc==2.9.3
|
||||
dvc[ssh]
|
||||
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
import argparse
|
||||
|
||||
from table_parsing.table_parsig import annotate_tables_in_pdf
|
||||
from box_detection.redaction_detection import annotate_boxes_in_pdf
|
||||
from layout_detection.layout_detection import annotate_layout_in_pdf
|
||||
from vidocp.table_parsig import annotate_tables_in_pdf
|
||||
from vidocp.redaction_detection import annotate_boxes_in_pdf
|
||||
from vidocp.layout_detection import annotate_layout_in_pdf
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
||||
4
setup.py
4
setup.py
@ -3,11 +3,11 @@
|
||||
from distutils.core import setup
|
||||
|
||||
setup(
|
||||
name="table_parsing",
|
||||
name="vidocp",
|
||||
version="0.0.1",
|
||||
description="",
|
||||
author="",
|
||||
author_email="",
|
||||
url="",
|
||||
packages=["table_parsing"],
|
||||
packages=["vidocp"],
|
||||
)
|
||||
|
||||
@ -42,9 +42,9 @@ def find_redactions(image: np.array, min_normalized_area=200000):
|
||||
return contours
|
||||
|
||||
|
||||
def annotate_poly(image, conts):
|
||||
for cont in conts:
|
||||
cv2.drawContours(image, cont, -1, (0, 255, 0), 2)
|
||||
def annotate_poly(image, contours):
|
||||
for cont in contours:
|
||||
cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
|
||||
|
||||
return image
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user