Merge branch 'table_parsing_version_2' of ssh://git.iqser.com:2222/rr/table_parsing into uncommon-tables

 Conflicts:
	requirements.txt
This commit is contained in:
llocarnini 2022-02-05 18:03:14 +01:00
commit 17b8e3a16e
22 changed files with 818 additions and 21 deletions

3
.dvc/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

7
.dvc/config Normal file
View File

@ -0,0 +1,7 @@
[core]
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
port = 22

107
.dvc/plots/confusion.json Normal file
View File

@ -0,0 +1,107 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"facet": {
"field": "rev",
"type": "nominal"
},
"spec": {
"transform": [
{
"aggregate": [
{
"op": "count",
"as": "xy_count"
}
],
"groupby": [
"<DVC_METRIC_Y>",
"<DVC_METRIC_X>"
]
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_Y>"
],
"key": "<DVC_METRIC_X>",
"value": 0
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_X>"
],
"key": "<DVC_METRIC_Y>",
"value": 0
},
{
"joinaggregate": [
{
"op": "max",
"field": "xy_count",
"as": "max_count"
}
],
"groupby": []
},
{
"calculate": "datum.xy_count / datum.max_count",
"as": "percent_of_max"
}
],
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_Y_LABEL>"
}
},
"layer": [
{
"mark": "rect",
"width": 300,
"height": 300,
"encoding": {
"color": {
"field": "xy_count",
"type": "quantitative",
"title": "",
"scale": {
"domainMin": 0,
"nice": true
}
}
}
},
{
"mark": "text",
"encoding": {
"text": {
"field": "xy_count",
"type": "quantitative"
},
"color": {
"condition": {
"test": "datum.percent_of_max > 0.5",
"value": "white"
},
"value": "black"
}
}
}
]
}
}

View File

@ -0,0 +1,112 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"facet": {
"field": "rev",
"type": "nominal"
},
"spec": {
"transform": [
{
"aggregate": [
{
"op": "count",
"as": "xy_count"
}
],
"groupby": [
"<DVC_METRIC_Y>",
"<DVC_METRIC_X>"
]
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_Y>"
],
"key": "<DVC_METRIC_X>",
"value": 0
},
{
"impute": "xy_count",
"groupby": [
"rev",
"<DVC_METRIC_X>"
],
"key": "<DVC_METRIC_Y>",
"value": 0
},
{
"joinaggregate": [
{
"op": "sum",
"field": "xy_count",
"as": "sum_y"
}
],
"groupby": [
"<DVC_METRIC_Y>"
]
},
{
"calculate": "datum.xy_count / datum.sum_y",
"as": "percent_of_y"
}
],
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "nominal",
"sort": "ascending",
"title": "<DVC_METRIC_Y_LABEL>"
}
},
"layer": [
{
"mark": "rect",
"width": 300,
"height": 300,
"encoding": {
"color": {
"field": "percent_of_y",
"type": "quantitative",
"title": "",
"scale": {
"domain": [
0,
1
]
}
}
}
},
{
"mark": "text",
"encoding": {
"text": {
"field": "percent_of_y",
"type": "quantitative",
"format": ".2f"
},
"color": {
"condition": {
"test": "datum.percent_of_y > 0.5",
"value": "white"
},
"value": "black"
}
}
}
]
}
}

116
.dvc/plots/linear.json Normal file
View File

@ -0,0 +1,116 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"width": 300,
"height": 300,
"layer": [
{
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"layer": [
{
"mark": "line"
},
{
"selection": {
"label": {
"type": "single",
"nearest": true,
"on": "mouseover",
"encodings": [
"x"
],
"empty": "none",
"clear": "mouseout"
}
},
"mark": "point",
"encoding": {
"opacity": {
"condition": {
"selection": "label",
"value": 1
},
"value": 0
}
}
}
]
},
{
"transform": [
{
"filter": {
"selection": "label"
}
}
],
"layer": [
{
"mark": {
"type": "rule",
"color": "gray"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative"
}
}
},
{
"encoding": {
"text": {
"type": "quantitative",
"field": "<DVC_METRIC_Y>"
},
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative"
}
},
"layer": [
{
"mark": {
"type": "text",
"align": "left",
"dx": 5,
"dy": -5
},
"encoding": {
"color": {
"type": "nominal",
"field": "rev"
}
}
}
]
}
]
}
]
}

104
.dvc/plots/scatter.json Normal file
View File

@ -0,0 +1,104 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"width": 300,
"height": 300,
"layer": [
{
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"layer": [
{
"mark": "point"
},
{
"selection": {
"label": {
"type": "single",
"nearest": true,
"on": "mouseover",
"encodings": [
"x"
],
"empty": "none",
"clear": "mouseout"
}
},
"mark": "point",
"encoding": {
"opacity": {
"condition": {
"selection": "label",
"value": 1
},
"value": 0
}
}
}
]
},
{
"transform": [
{
"filter": {
"selection": "label"
}
}
],
"layer": [
{
"encoding": {
"text": {
"type": "quantitative",
"field": "<DVC_METRIC_Y>"
},
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative"
}
},
"layer": [
{
"mark": {
"type": "text",
"align": "left",
"dx": 5,
"dy": -5
},
"encoding": {
"color": {
"type": "nominal",
"field": "rev"
}
}
}
]
}
]
}
]
}

31
.dvc/plots/simple.json Normal file
View File

@ -0,0 +1,31 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"width": 300,
"height": 300,
"mark": {
"type": "line"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
}
}

39
.dvc/plots/smooth.json Normal file
View File

@ -0,0 +1,39 @@
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"values": "<DVC_METRIC_DATA>"
},
"title": "<DVC_METRIC_TITLE>",
"mark": {
"type": "line"
},
"encoding": {
"x": {
"field": "<DVC_METRIC_X>",
"type": "quantitative",
"title": "<DVC_METRIC_X_LABEL>"
},
"y": {
"field": "<DVC_METRIC_Y>",
"type": "quantitative",
"title": "<DVC_METRIC_Y_LABEL>",
"scale": {
"zero": false
}
},
"color": {
"field": "rev",
"type": "nominal"
}
},
"transform": [
{
"loess": "<DVC_METRIC_Y>",
"on": "<DVC_METRIC_X>",
"groupby": [
"rev"
],
"bandwidth": 0.3
}
]
}

3
.dvcignore Normal file
View File

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

68
README.md Normal file
View File

@ -0,0 +1,68 @@
# Vidocp &mdash; Visual Document Parsing
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.
## Installation
```bash
git clone ssh://git@git.iqser.com:2222/rr/vidocp.git
cd vidocp
python -m venv env
source env/bin/activate
pip install -e .
pip install -r requirements.txt
dvc pull
```
## Usage
### As an API
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
the specific task. Example for finding the outlines of previous redactions:
```python
from vidocp.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
```
### As a CLI Tool
Core API functionalities can be used through a CLI.
#### Table Parsing
The table parsing utility detects and segments tables into individual cells.
```bash
python scripts/annotate.py data/test_pdf.pdf 2 --type table
```
#### Redaction Detection
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
```bash
python scripts/annotate.py <path to pdf> 0 --type redaction
```
The below image shows the detected redactions with green outlines.
![](data/redaction_detection.png)

1
data/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/test_pdf.pdf

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 MiB

4
data/test_pdf.pdf.dvc Normal file
View File

@ -0,0 +1,4 @@
outs:
- md5: 60840305e4ddb084aea21976b8b7c49e
size: 6916053
path: test_pdf.pdf

View File

@ -2,4 +2,8 @@ opencv-python~=4.5.5.62
numpy~=1.22.1
pdf2image~=1.16.0
matplotlib~=3.5.1
imutils~=0.5.4
imutils==0.5.4
iteration-utilities==0.11.0
dvc==2.9.3
dvc[ssh]

26
scripts/annotate.py Normal file
View File

@ -0,0 +1,26 @@
import argparse
from vidocp.table_parsing_2 import annotate_tables_in_pdf
from vidocp.redaction_detection import annotate_boxes_in_pdf
from vidocp.layout_detection import annotate_layout_in_pdf
def parse_args(argv=None):
    """Parse the command-line arguments of the annotation script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process arguments, which is the
            behaviour callers without arguments have always had.

    Returns:
        argparse.Namespace with ``pdf_path`` (str), ``page_index`` (int) and
        ``type`` (one of "table", "redaction", "layout"; defaults to "table").
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_path")
    parser.add_argument("page_index", type=int)
    parser.add_argument("--type", choices=["table", "redaction", "layout"], default="table")
    return parser.parse_args(argv)
if __name__ == "__main__":
    cli_args = parse_args()

    # Map every accepted --type value to the function that visualises it.
    annotators = {
        "table": annotate_tables_in_pdf,
        "redaction": annotate_boxes_in_pdf,
        "layout": annotate_layout_in_pdf,
    }
    annotate = annotators.get(cli_args.type)
    if annotate is not None:
        annotate(cli_args.pdf_path, page_index=cli_args.page_index)

View File

@ -1,18 +0,0 @@
import argparse
from table_parsing.table_parsig import annotate_tables_in_pdf
def parse_args():
    """Read the PDF path and the integer page index from the command line."""
    cli = argparse.ArgumentParser()
    cli.add_argument("pdf_path")
    cli.add_argument("page_index", type=int)
    return cli.parse_args()
if __name__ == "__main__":
    # Annotate the tables on the page selected on the command line.
    cli_args = parse_args()
    annotate_tables_in_pdf(cli_args.pdf_path, page_index=cli_args.page_index)

View File

@ -3,11 +3,11 @@
from distutils.core import setup
setup(
name="table_parsing",
name="vidocp",
version="0.0.1",
description="",
author="",
author_email="",
url="",
packages=["table_parsing"],
packages=["vidocp"],
)

View File

@ -0,0 +1,53 @@
from itertools import count
import cv2
import numpy as np
import pdf2image
from matplotlib import pyplot as plt
import imutils
def find_layout_boxes(image: np.array):
    """Yield bounding rectangles of the solid regions found in a page image.

    The image is converted to grayscale, blurred and thresholded at 253, then
    inverted so that every pixel at or below the threshold becomes foreground.
    The foreground is opened with two 10x10 kernels, the results are OR-ed
    together, and each external contour of that mask is approximated by a
    polygon.

    Args:
        image: Page image as a colour array accepted by ``cv2.cvtColor``.

    Yields:
        One ``cv2.boundingRect`` result ``(x, y, w, h)`` per external contour.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(gray, (5, 5), 1)
    _, binary = cv2.threshold(smoothed, 253, 255, cv2.THRESH_BINARY)
    foreground = ~binary

    line_min_width = 10
    horizontal_kernel = np.ones((10, line_min_width), np.uint8)
    vertical_kernel = np.ones((line_min_width, 10), np.uint8)
    opened_h = cv2.morphologyEx(foreground, cv2.MORPH_OPEN, horizontal_kernel)
    opened_v = cv2.morphologyEx(foreground, cv2.MORPH_OPEN, vertical_kernel)
    mask = opened_h | opened_v

    # grab_contours hides the differing return shapes of cv2.findContours.
    found = imutils.grab_contours(cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE))

    for contour in found:
        perimeter = cv2.arcLength(contour, True)
        polygon = cv2.approxPolyDP(contour, 0.04 * perimeter, True)
        yield cv2.boundingRect(polygon)
def annotate_layout_boxes(image, rects):
    """Draw a green, 2 px outline for every rectangle onto ``image`` in place.

    Args:
        image: Array to draw on; it is modified and also returned.
        rects: Iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        The same ``image`` object that was passed in.
    """
    for x, y, w, h in rects:
        top_left = (x, y)
        bottom_right = (x + w, y + h)
        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)
    return image
def annotate_layout_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, outline its layout boxes and show the result.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Index of the page to show; 1 is added before it is passed
            to pdf2image as both ``first_page`` and ``last_page``.
    """
    page_number = page_index + 1
    pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(pages[0])

    page = annotate_layout_boxes(page, find_layout_boxes(page))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()

View File

@ -0,0 +1,63 @@
from functools import partial
import cv2
import numpy as np
import pdf2image
from iteration_utilities import starfilter, first
from matplotlib import pyplot as plt
def is_filled(hierarchy):
    """Tell whether a contour's hierarchy entry describes a filled shape.

    Args:
        hierarchy: One hierarchy row of ``cv2.findContours``; index 2 is read
            as the first-child index and index 3 as the parent index.

    Returns:
        Truthy when the parent index is at most 0 and there is no child (-1).
    """
    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
    first_child, parent = hierarchy[2], hierarchy[3]
    return parent <= 0 and first_child == -1
def is_boxy(contour):
    """Return True when the contour simplifies to at most 10 polygon points.

    The approximation tolerance is 1 % of the closed contour's perimeter.
    """
    tolerance = 0.01 * cv2.arcLength(contour, True)
    corner_count = len(cv2.approxPolyDP(contour, tolerance, True))
    return corner_count <= 10
def is_large_enough(contour, min_area):
    """Return whether the contour's (unsigned) area is strictly above ``min_area``."""
    area = cv2.contourArea(contour, False)
    return area > min_area
def is_likely_redaction(contour, hierarchy, min_area):
    """Combine the filled, boxy and size checks for one contour.

    The checks run in that order and stop at the first one that fails; the
    result of the last check that ran is returned.
    """
    filled = is_filled(hierarchy)
    if not filled:
        return filled

    boxy = is_boxy(contour)
    if not boxy:
        return boxy

    return is_large_enough(contour, min_area)
def find_redactions(image: np.array, min_normalized_area=200000):
    """Find the contours of likely redactions in a page image.

    The image is converted to grayscale and inverted, blurred, and thresholded
    at 252 so that only the darkest regions of the original page remain as
    foreground. Every contour of that mask is then tested with
    ``is_likely_redaction``.

    Args:
        image: Page image as a colour array accepted by ``cv2.cvtColor``.
        min_normalized_area: Minimum contour area; divided by 200 before it is
            compared against the contour area.

    Returns:
        A lazy iterator over the contours that pass ``is_likely_redaction``.
        It is empty when the page contains no contours at all.
    """
    min_area = min_normalized_area / 200  # Assumes 200 DPI PDF -> image conversion resolution

    gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 1)
    thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1]

    contours, hierarchies = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)

    # OpenCV hands back None instead of an empty hierarchy when nothing was
    # found; indexing it below would raise TypeError on such pages.
    if hierarchies is None:
        return iter(())

    return (
        contour
        for contour, hierarchy in zip(contours, hierarchies[0])
        if is_likely_redaction(contour, hierarchy, min_area=min_area)
    )
def annotate_poly(image, contours):
    """Draw a green, 4 px outline for every contour onto ``image`` in place.

    Args:
        image: Array to draw on; it is modified and also returned.
        contours: Iterable of contours in ``cv2.findContours`` format.

    Returns:
        The same ``image`` object that was passed in.
    """
    for cont in contours:
        # drawContours expects a list of contours. Handing it a bare contour
        # makes it treat every point as a separate one-point contour, which
        # only looks like an outline while the contour is densely sampled.
        cv2.drawContours(image, [cont], -1, (0, 255, 0), 4)
    return image
def annotate_boxes_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, outline its detected redactions and show the result.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Index of the page to show; 1 is added before it is passed
            to pdf2image as both ``first_page`` and ``last_page``.
    """
    page_number = page_index + 1
    pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(pages[0])

    page = annotate_poly(page, find_redactions(page))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()

74
vidocp/table_parsing_2.py Normal file
View File

@ -0,0 +1,74 @@
import cv2
import matplotlib.pyplot as plt
import numpy as np
from pdf2image import pdf2image
def add_external_contours(image, img):
    """Draw the bounding box of every external contour of ``img`` onto ``image``.

    Args:
        image: Array that receives the 1 px rectangles (value 255); modified
            in place and returned.
        img: Binary image whose external contours are searched.

    Returns:
        The same ``image`` object that was passed in.
    """
    contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for left, top, width, height in map(cv2.boundingRect, contours):
        cv2.rectangle(image, (left, top), (left + width, top + height), 255, 1)
    return image
def isolate_vertical_and_horizontal_components(img_bin):
    """Keep only the horizontal and vertical line-like parts of a binary image.

    The image is opened once with a 1x30 kernel and once with a 30x1 kernel;
    the two results are OR-ed together.
    """
    min_line_length = 30
    row_kernel = np.ones((1, min_line_length), np.uint8)
    column_kernel = np.ones((min_line_length, 1), np.uint8)

    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, row_kernel)
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, column_kernel)
    return horizontal | vertical
def annotate_image(image, stats):
    """Return a copy of ``image`` with component boxes and their sizes drawn in.

    Args:
        image: Array to annotate; it is copied, the original stays untouched.
        stats: Rows of ``(x, y, w, h, area)``. The first two rows are skipped,
            and only rows wider and taller than 10 are drawn.

    Returns:
        The annotated copy.
    """
    annotated = image.copy()
    magenta = (255, 0, 255)

    for x, y, w, h, _area in stats[2:]:
        if w <= 10 or h <= 10:
            continue

        cv2.rectangle(annotated, (x, y), (x + w, y + h), magenta, 2)

        # Stack the four "name = value" labels upwards from the box bottom.
        for line_no, (name, value) in enumerate(zip("xywh", (x, y, w, h)), start=1):
            text = f"{name} = {value}"
            origin = (int(x + 5), int(y + h - (20 * line_no)))
            cv2.putText(annotated, text, origin, cv2.FONT_HERSHEY_SIMPLEX, 0.7, magenta, 2)

    return annotated
def parse_table(image: np.array):
    """Compute connected-component statistics for the cells of a page image.

    The page is binarised at 150 and inverted, reduced to its horizontal and
    vertical line components, and closed off with the bounding boxes of its
    external contours. Connected components are then computed on the inverse
    of that line mask.

    Args:
        image: Page image as a colour array accepted by ``cv2.cvtColor``.

    Returns:
        The ``stats`` array of ``cv2.connectedComponentsWithStats``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    line_mask = isolate_vertical_and_horizontal_components(~binary)
    line_mask = add_external_contours(line_mask, line_mask)

    _, _, stats, _ = cv2.connectedComponentsWithStats(~line_mask, connectivity=8, ltype=cv2.CV_32S)
    return stats
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, annotate its parsed table cells and show the result.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Index of the page to show; 1 is added before it is passed
            to pdf2image as both ``first_page`` and ``last_page``.
    """
    page_number = page_index + 1
    pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(pages[0])

    page = annotate_image(page, parse_table(page))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()