Merge branch 'table_parsing_version_2' of ssh://git.iqser.com:2222/rr/table_parsing into uncommon-tables
Conflicts: requirements.txt
This commit is contained in:
commit
17b8e3a16e
3
.dvc/.gitignore
vendored
Normal file
3
.dvc/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/config.local
|
||||
/tmp
|
||||
/cache
|
||||
7
.dvc/config
Normal file
7
.dvc/config
Normal file
@ -0,0 +1,7 @@
|
||||
[core]
|
||||
remote = vector
|
||||
autostage = true
|
||||
['remote "vector"']
|
||||
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
|
||||
port = 22
|
||||
|
||||
107
.dvc/plots/confusion.json
Normal file
107
.dvc/plots/confusion.json
Normal file
@ -0,0 +1,107 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"facet": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
},
|
||||
"spec": {
|
||||
"transform": [
|
||||
{
|
||||
"aggregate": [
|
||||
{
|
||||
"op": "count",
|
||||
"as": "xy_count"
|
||||
}
|
||||
],
|
||||
"groupby": [
|
||||
"<DVC_METRIC_Y>",
|
||||
"<DVC_METRIC_X>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_Y>"
|
||||
],
|
||||
"key": "<DVC_METRIC_X>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_X>"
|
||||
],
|
||||
"key": "<DVC_METRIC_Y>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"joinaggregate": [
|
||||
{
|
||||
"op": "max",
|
||||
"field": "xy_count",
|
||||
"as": "max_count"
|
||||
}
|
||||
],
|
||||
"groupby": []
|
||||
},
|
||||
{
|
||||
"calculate": "datum.xy_count / datum.max_count",
|
||||
"as": "percent_of_max"
|
||||
}
|
||||
],
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_Y_LABEL>"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "rect",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"encoding": {
|
||||
"color": {
|
||||
"field": "xy_count",
|
||||
"type": "quantitative",
|
||||
"title": "",
|
||||
"scale": {
|
||||
"domainMin": 0,
|
||||
"nice": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"mark": "text",
|
||||
"encoding": {
|
||||
"text": {
|
||||
"field": "xy_count",
|
||||
"type": "quantitative"
|
||||
},
|
||||
"color": {
|
||||
"condition": {
|
||||
"test": "datum.percent_of_max > 0.5",
|
||||
"value": "white"
|
||||
},
|
||||
"value": "black"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
112
.dvc/plots/confusion_normalized.json
Normal file
112
.dvc/plots/confusion_normalized.json
Normal file
@ -0,0 +1,112 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"facet": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
},
|
||||
"spec": {
|
||||
"transform": [
|
||||
{
|
||||
"aggregate": [
|
||||
{
|
||||
"op": "count",
|
||||
"as": "xy_count"
|
||||
}
|
||||
],
|
||||
"groupby": [
|
||||
"<DVC_METRIC_Y>",
|
||||
"<DVC_METRIC_X>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_Y>"
|
||||
],
|
||||
"key": "<DVC_METRIC_X>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"impute": "xy_count",
|
||||
"groupby": [
|
||||
"rev",
|
||||
"<DVC_METRIC_X>"
|
||||
],
|
||||
"key": "<DVC_METRIC_Y>",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"joinaggregate": [
|
||||
{
|
||||
"op": "sum",
|
||||
"field": "xy_count",
|
||||
"as": "sum_y"
|
||||
}
|
||||
],
|
||||
"groupby": [
|
||||
"<DVC_METRIC_Y>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"calculate": "datum.xy_count / datum.sum_y",
|
||||
"as": "percent_of_y"
|
||||
}
|
||||
],
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "nominal",
|
||||
"sort": "ascending",
|
||||
"title": "<DVC_METRIC_Y_LABEL>"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "rect",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"encoding": {
|
||||
"color": {
|
||||
"field": "percent_of_y",
|
||||
"type": "quantitative",
|
||||
"title": "",
|
||||
"scale": {
|
||||
"domain": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"mark": "text",
|
||||
"encoding": {
|
||||
"text": {
|
||||
"field": "percent_of_y",
|
||||
"type": "quantitative",
|
||||
"format": ".2f"
|
||||
},
|
||||
"color": {
|
||||
"condition": {
|
||||
"test": "datum.percent_of_y > 0.5",
|
||||
"value": "white"
|
||||
},
|
||||
"value": "black"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
116
.dvc/plots/linear.json
Normal file
116
.dvc/plots/linear.json
Normal file
@ -0,0 +1,116 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"layer": [
|
||||
{
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "line"
|
||||
},
|
||||
{
|
||||
"selection": {
|
||||
"label": {
|
||||
"type": "single",
|
||||
"nearest": true,
|
||||
"on": "mouseover",
|
||||
"encodings": [
|
||||
"x"
|
||||
],
|
||||
"empty": "none",
|
||||
"clear": "mouseout"
|
||||
}
|
||||
},
|
||||
"mark": "point",
|
||||
"encoding": {
|
||||
"opacity": {
|
||||
"condition": {
|
||||
"selection": "label",
|
||||
"value": 1
|
||||
},
|
||||
"value": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"transform": [
|
||||
{
|
||||
"filter": {
|
||||
"selection": "label"
|
||||
}
|
||||
}
|
||||
],
|
||||
"layer": [
|
||||
{
|
||||
"mark": {
|
||||
"type": "rule",
|
||||
"color": "gray"
|
||||
},
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"encoding": {
|
||||
"text": {
|
||||
"type": "quantitative",
|
||||
"field": "<DVC_METRIC_Y>"
|
||||
},
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": {
|
||||
"type": "text",
|
||||
"align": "left",
|
||||
"dx": 5,
|
||||
"dy": -5
|
||||
},
|
||||
"encoding": {
|
||||
"color": {
|
||||
"type": "nominal",
|
||||
"field": "rev"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
104
.dvc/plots/scatter.json
Normal file
104
.dvc/plots/scatter.json
Normal file
@ -0,0 +1,104 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"layer": [
|
||||
{
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": "point"
|
||||
},
|
||||
{
|
||||
"selection": {
|
||||
"label": {
|
||||
"type": "single",
|
||||
"nearest": true,
|
||||
"on": "mouseover",
|
||||
"encodings": [
|
||||
"x"
|
||||
],
|
||||
"empty": "none",
|
||||
"clear": "mouseout"
|
||||
}
|
||||
},
|
||||
"mark": "point",
|
||||
"encoding": {
|
||||
"opacity": {
|
||||
"condition": {
|
||||
"selection": "label",
|
||||
"value": 1
|
||||
},
|
||||
"value": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"transform": [
|
||||
{
|
||||
"filter": {
|
||||
"selection": "label"
|
||||
}
|
||||
}
|
||||
],
|
||||
"layer": [
|
||||
{
|
||||
"encoding": {
|
||||
"text": {
|
||||
"type": "quantitative",
|
||||
"field": "<DVC_METRIC_Y>"
|
||||
},
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative"
|
||||
}
|
||||
},
|
||||
"layer": [
|
||||
{
|
||||
"mark": {
|
||||
"type": "text",
|
||||
"align": "left",
|
||||
"dx": 5,
|
||||
"dy": -5
|
||||
},
|
||||
"encoding": {
|
||||
"color": {
|
||||
"type": "nominal",
|
||||
"field": "rev"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
31
.dvc/plots/simple.json
Normal file
31
.dvc/plots/simple.json
Normal file
@ -0,0 +1,31 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"width": 300,
|
||||
"height": 300,
|
||||
"mark": {
|
||||
"type": "line"
|
||||
},
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
}
|
||||
}
|
||||
39
.dvc/plots/smooth.json
Normal file
39
.dvc/plots/smooth.json
Normal file
@ -0,0 +1,39 @@
|
||||
{
|
||||
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
|
||||
"data": {
|
||||
"values": "<DVC_METRIC_DATA>"
|
||||
},
|
||||
"title": "<DVC_METRIC_TITLE>",
|
||||
"mark": {
|
||||
"type": "line"
|
||||
},
|
||||
"encoding": {
|
||||
"x": {
|
||||
"field": "<DVC_METRIC_X>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_X_LABEL>"
|
||||
},
|
||||
"y": {
|
||||
"field": "<DVC_METRIC_Y>",
|
||||
"type": "quantitative",
|
||||
"title": "<DVC_METRIC_Y_LABEL>",
|
||||
"scale": {
|
||||
"zero": false
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"field": "rev",
|
||||
"type": "nominal"
|
||||
}
|
||||
},
|
||||
"transform": [
|
||||
{
|
||||
"loess": "<DVC_METRIC_Y>",
|
||||
"on": "<DVC_METRIC_X>",
|
||||
"groupby": [
|
||||
"rev"
|
||||
],
|
||||
"bandwidth": 0.3
|
||||
}
|
||||
]
|
||||
}
|
||||
3
.dvcignore
Normal file
3
.dvcignore
Normal file
@ -0,0 +1,3 @@
|
||||
# Add patterns of files dvc should ignore, which could improve
|
||||
# the performance. Learn more at
|
||||
# https://dvc.org/doc/user-guide/dvcignore
|
||||
68
README.md
Normal file
68
README.md
Normal file
@ -0,0 +1,68 @@
|
||||
# Vidocp — Visual Document Parsing
|
||||
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
git clone ssh://git@git.iqser.com:2222/rr/vidocp.git
|
||||
cd vidocp
|
||||
|
||||
python -m venv env
|
||||
source env/bin/activate
|
||||
|
||||
pip install -e .
|
||||
pip install -r requirements.txt
|
||||
|
||||
dvc pull
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### As an API
|
||||
|
||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task. Example for finding the outlines of previous redactions:
|
||||
|
||||
```python
|
||||
|
||||
from vidocp.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
|
||||
page = np.array(page)
|
||||
|
||||
redaction_contours = find_redactions(page)
|
||||
```
|
||||
|
||||
|
||||
### As a CLI Tool
|
||||
|
||||
|
||||
Core API functionalities can be used through a CLI.
|
||||
|
||||
|
||||
#### Table Parsing
|
||||
|
||||
The table parsing utility detects and segments tables into individual cells.
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 2 --type table
|
||||
```
|
||||
|
||||
|
||||
#### Redaction Detection
|
||||
|
||||
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
|
||||
```bash
|
||||
python scripts/annotate.py <path to pdf> 0 --type redaction
|
||||
```
|
||||
|
||||
The below image shows the detected redactions with green outlines.
|
||||
|
||||

|
||||
1
data/.gitignore
vendored
Normal file
1
data/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/test_pdf.pdf
|
||||
BIN
data/redaction_detection.png
Normal file
BIN
data/redaction_detection.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.2 MiB |
4
data/test_pdf.pdf.dvc
Normal file
4
data/test_pdf.pdf.dvc
Normal file
@ -0,0 +1,4 @@
|
||||
outs:
|
||||
- md5: 60840305e4ddb084aea21976b8b7c49e
|
||||
size: 6916053
|
||||
path: test_pdf.pdf
|
||||
@ -2,4 +2,8 @@ opencv-python~=4.5.5.62
|
||||
numpy~=1.22.1
|
||||
pdf2image~=1.16.0
|
||||
matplotlib~=3.5.1
|
||||
imutils~=0.5.4
|
||||
imutils==0.5.4
|
||||
iteration-utilities==0.11.0
|
||||
dvc==2.9.3
|
||||
dvc[ssh]
|
||||
|
||||
|
||||
26
scripts/annotate.py
Normal file
26
scripts/annotate.py
Normal file
@ -0,0 +1,26 @@
|
||||
import argparse
|
||||
|
||||
from vidocp.table_parsing_2 import annotate_tables_in_pdf
|
||||
from vidocp.redaction_detection import annotate_boxes_in_pdf
|
||||
from vidocp.layout_detection import annotate_layout_in_pdf
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line of the annotation script.

    Returns:
        The ``argparse.Namespace`` holding ``pdf_path``, ``page_index``
        (converted to ``int``) and ``type`` (one of ``"table"``,
        ``"redaction"`` or ``"layout"``; ``"table"`` when omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("pdf_path")
    cli.add_argument("page_index", type=int)
    cli.add_argument(
        "--type",
        choices=["table", "redaction", "layout"],
        default="table",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Each value accepted by --type maps to the function that annotates and
    # displays the requested page.
    annotators = {
        "table": annotate_tables_in_pdf,
        "redaction": annotate_boxes_in_pdf,
        "layout": annotate_layout_in_pdf,
    }

    args = parse_args()
    annotate = annotators.get(args.type)
    if annotate is not None:
        annotate(args.pdf_path, page_index=args.page_index)
|
||||
@ -1,18 +0,0 @@
|
||||
import argparse
|
||||
|
||||
from table_parsing.table_parsig import annotate_tables_in_pdf
|
||||
|
||||
|
||||
def parse_args():
    """Parse the two positional command line arguments of the script.

    Returns:
        The ``argparse.Namespace`` holding ``pdf_path`` and ``page_index``
        (converted to ``int``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("pdf_path")
    cli.add_argument("page_index", type=int)
    return cli.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read pdf_path and page_index from the command line
    # and pass them on to annotate_tables_in_pdf.
    args = parse_args()
    annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
|
||||
4
setup.py
4
setup.py
@ -3,11 +3,11 @@
|
||||
from distutils.core import setup
|
||||
|
||||
setup(
|
||||
name="table_parsing",
|
||||
name="vidocp",
|
||||
version="0.0.1",
|
||||
description="",
|
||||
author="",
|
||||
author_email="",
|
||||
url="",
|
||||
packages=["table_parsing"],
|
||||
packages=["vidocp"],
|
||||
)
|
||||
|
||||
53
vidocp/layout_detection.py
Normal file
53
vidocp/layout_detection.py
Normal file
@ -0,0 +1,53 @@
|
||||
from itertools import count
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pdf2image
|
||||
from matplotlib import pyplot as plt
|
||||
import imutils
|
||||
|
||||
|
||||
def find_layout_boxes(image: np.array):
    """Lazily yield bounding rectangles of solid regions in a page image.

    The image is converted to gray scale, blurred and binarised so that every
    pixel whose blurred value is not above 253 becomes foreground. Two
    morphological openings remove foreground structures smaller than the
    kernels, the results are merged, and the external contours of the merged
    mask are approximated by polygons.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY``.

    Yields:
        One ``(x, y, w, h)`` tuple from ``cv2.boundingRect`` per external contour.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(gray, (5, 5), 1)
    _, near_white = cv2.threshold(smoothed, 253, 255, cv2.THRESH_BINARY)
    foreground = ~near_white

    line_min_width = 10
    horizontal_kernel = np.ones((10, line_min_width), np.uint8)
    vertical_kernel = np.ones((line_min_width, 10), np.uint8)

    opened_h = cv2.morphologyEx(foreground, cv2.MORPH_OPEN, horizontal_kernel)
    opened_v = cv2.morphologyEx(foreground, cv2.MORPH_OPEN, vertical_kernel)
    merged = opened_h | opened_v

    # imutils.grab_contours extracts the contour list from the findContours result.
    found = imutils.grab_contours(
        cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    )
    for outline in found:
        # Polygon approximation tolerance is 4% of the closed contour's perimeter.
        tolerance = 0.04 * cv2.arcLength(outline, True)
        polygon = cv2.approxPolyDP(outline, tolerance, True)
        yield cv2.boundingRect(polygon)
|
||||
|
||||
|
||||
def annotate_layout_boxes(image, rects):
    """Draw every rectangle of ``rects`` onto ``image`` and return the image.

    Args:
        image: Image array; it is drawn on in place.
        rects: Iterable of ``(x, y, w, h)`` tuples.

    Returns:
        The same ``image`` object with 2 px wide ``(0, 255, 0)`` outlines added.
    """
    for left, top, width, height in rects:
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    return image
|
||||
|
||||
|
||||
def annotate_layout_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, outline its layout boxes and display the result.

    Args:
        pdf_path: Path of the PDF file handed to ``pdf2image``.
        page_index: Index of the page; ``page_index + 1`` is passed to
            ``pdf2image`` as both ``first_page`` and ``last_page``.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    page = annotate_layout_boxes(page, find_layout_boxes(page))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()
|
||||
63
vidocp/redaction_detection.py
Normal file
63
vidocp/redaction_detection.py
Normal file
@ -0,0 +1,63 @@
|
||||
from functools import partial
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pdf2image
|
||||
from iteration_utilities import starfilter, first
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
|
||||
def is_filled(hierarchy):
    """Tell whether a contour's hierarchy entry describes a filled shape.

    Args:
        hierarchy: Indexable hierarchy entry of one contour; elements 2 and 3
            are read (first child and parent in OpenCV's
            ``[next, previous, first_child, parent]`` layout).

    Returns:
        True when element 3 is ``<= 0`` and element 2 equals ``-1``.
    """
    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
    first_child, parent = hierarchy[2], hierarchy[3]
    return parent <= 0 and first_child == -1
|
||||
|
||||
|
||||
def is_boxy(contour):
    """Tell whether a contour simplifies to a polygon of at most ten vertices.

    The contour is approximated as a closed polygon with a tolerance of 1% of
    its perimeter before the vertices are counted.
    """
    tolerance = 0.01 * cv2.arcLength(contour, True)
    polygon = cv2.approxPolyDP(contour, tolerance, True)
    vertex_count = len(polygon)
    return vertex_count <= 10
|
||||
|
||||
|
||||
def is_large_enough(contour, min_area):
    """Tell whether the unsigned area enclosed by ``contour`` exceeds ``min_area``."""
    enclosed_area = cv2.contourArea(contour, False)
    return enclosed_area > min_area
|
||||
|
||||
|
||||
def is_likely_redaction(contour, hierarchy, min_area):
    """Combine the filled, boxy and size checks for one contour.

    Args:
        contour: Contour passed to ``is_boxy`` and ``is_large_enough``.
        hierarchy: Hierarchy entry of the contour, passed to ``is_filled``.
        min_area: Area the contour has to exceed.

    Returns:
        A truthy value only if all three checks pass; later checks are skipped
        as soon as one fails.
    """
    filled_and_boxy = is_filled(hierarchy) and is_boxy(contour)
    return filled_and_boxy and is_large_enough(contour, min_area)
|
||||
|
||||
|
||||
def find_redactions(image: np.array, min_normalized_area=200000):
    """Find contours that look like previous redactions in a page image.

    The image is converted to gray scale and inverted, blurred, and
    thresholded so that only pixels whose inverted, blurred value exceeds 252
    (i.e. near-black pixels of the page) remain. Contours of the resulting
    mask are kept when ``is_likely_redaction`` accepts them.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY``.
        min_normalized_area: Minimum contour area before division by 200.

    Returns:
        A lazy iterator over the accepted contours; empty when the page
        contains no contours at all.
    """
    min_area = min_normalized_area / 200  # Assumes 200 DPI PDF -> image conversion resolution

    gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 1)
    thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1]

    contours, hierarchies = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)

    # findContours yields no hierarchy array (None) when nothing was found;
    # indexing it would raise TypeError, so report "no redactions" instead.
    if hierarchies is None:
        return iter(())

    is_redaction = partial(is_likely_redaction, min_area=min_area)
    # hierarchies has one leading axis of length 1; row i belongs to contours[i].
    return (
        contour
        for contour, hierarchy in zip(contours, hierarchies[0])
        if is_redaction(contour, hierarchy)
    )
|
||||
|
||||
|
||||
def annotate_poly(image, contours):
    """Draw the outline of every contour onto ``image`` and return the image.

    Args:
        image: Image array; it is drawn on in place.
        contours: Iterable of contours, e.g. the result of ``find_redactions``.

    Returns:
        The same ``image`` object with 4 px wide ``(0, 255, 0)`` outlines added.
    """
    for cont in contours:
        # drawContours expects a list of contours; passing a bare contour makes
        # it treat every point as a separate one-point contour.
        cv2.drawContours(image, [cont], -1, (0, 255, 0), 4)

    return image
|
||||
|
||||
|
||||
def annotate_boxes_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, outline detected redactions and display the result.

    Args:
        pdf_path: Path of the PDF file handed to ``pdf2image``.
        page_index: Index of the page; ``page_index + 1`` is passed to
            ``pdf2image`` as both ``first_page`` and ``last_page``.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    page = annotate_poly(page, find_redactions(page))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()
|
||||
74
vidocp/table_parsing_2.py
Normal file
74
vidocp/table_parsing_2.py
Normal file
@ -0,0 +1,74 @@
|
||||
import cv2
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from pdf2image import pdf2image
|
||||
|
||||
|
||||
def add_external_contours(image, img):
    """Draw the bounding box of every external contour of ``img`` onto ``image``.

    Args:
        image: Array that receives the 1 px wide rectangles (value 255);
            modified in place.
        img: Image in which the external contours are searched.

    Returns:
        The same ``image`` object that was passed in.
    """
    outlines, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    for left, top, width, height in map(cv2.boundingRect, outlines):
        cv2.rectangle(image, (left, top), (left + width, top + height), 255, 1)

    return image
|
||||
|
||||
|
||||
def isolate_vertical_and_horizontal_components(img_bin):
    """Keep only the long horizontal and vertical strokes of a binary image.

    The image is opened once with a 1x30 kernel and once with a 30x1 kernel,
    which removes foreground runs shorter than 30 pixels in the respective
    direction; the two results are OR-ed together.

    Args:
        img_bin: Binary image passed to ``cv2.morphologyEx``.

    Returns:
        The element-wise OR of the horizontally and vertically opened images.
    """
    min_line_length = 30
    row_kernel = np.ones((1, min_line_length), np.uint8)
    column_kernel = np.ones((min_line_length, 1), np.uint8)

    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, row_kernel)
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, column_kernel)

    return horizontal | vertical
|
||||
|
||||
|
||||
def annotate_image(image, stats):
    """Draw the component boxes in ``stats`` onto a copy of ``image``.

    Args:
        image: Image array; it is copied, so the caller's array is untouched.
        stats: Rows of ``(x, y, w, h, area)`` as returned by ``parse_table``.

    Returns:
        The annotated copy of ``image``.
    """

    image = image.copy()

    # NOTE(review): the first two rows are skipped - presumably the background
    # and the component surrounding the tables; confirm.
    for x, y, w, h, area in stats[2:]:
        # Boxes that are not larger than 10 px in both directions are ignored.
        if w > 10 and h > 10:
            cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)

            # NOTE(review): label loop assumed to sit inside the size check;
            # the visible source lost its indentation - confirm.
            # Write "x = ..", "y = ..", "w = ..", "h = .." inside the box,
            # stacked upwards from its bottom edge in 20 px steps.
            for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
                anno = f"{s} = {v}"
                xann = int(x + 5)
                yann = int(y + h - (20 * (i + 1)))
                cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)

    return image
|
||||
|
||||
|
||||
def parse_table(image: np.array):
    """Compute connected-component statistics of the cells enclosed by table lines.

    The image is binarised (gray values not above 150 become foreground),
    reduced to its long horizontal and vertical strokes, and the bounding
    boxes of the external contours are drawn into that mask. The connected
    components of the inverted mask are then measured.

    Args:
        image: Colour image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The ``stats`` array of ``cv2.connectedComponentsWithStats``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    dark_pixels = ~cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)[1]

    line_mask = isolate_vertical_and_horizontal_components(dark_pixels)
    # The mask is both searched for contours and drawn on.
    line_mask = add_external_contours(line_mask, line_mask)

    component_info = cv2.connectedComponentsWithStats(~line_mask, connectivity=8, ltype=cv2.CV_32S)

    return component_info[2]
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, outline detected table cells and display the result.

    Args:
        pdf_path: Path of the PDF file handed to ``pdf2image``.
        page_index: Index of the page; ``page_index + 1`` is passed to
            ``pdf2image`` as both ``first_page`` and ``last_page``.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    page = annotate_image(page, parse_table(page))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()
|
||||
Loading…
x
Reference in New Issue
Block a user