Merge branch 'master' of ssh://git.iqser.com:2222/rr/table_parsing into uncommon-tables

 Conflicts:
	scripts/annotate.py
	vidocp/table_parsig.py
This commit is contained in:
llocarnini 2022-02-06 15:10:32 +01:00
commit 27246f533a
18 changed files with 471 additions and 43 deletions

View File

@ -23,18 +23,21 @@ dvc pull
### As an API
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
the specific task. Example for finding the outlines of previous redactions.
the specific task.
#### Redaction Detection
The below snippet shows how to find the outlines of previous redactions.
```python
from vidocp.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
@ -52,17 +55,45 @@ Core API functionalities can be used through a CLI.
The table parsing utility detects and segments tables into individual cells.
```bash
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```
The below image shows a parsed table, where each table cell has been detected individually.
![](data/table_parsing.png)
#### Redaction Detection
The redaction detection utility detects previous redactions in PDFs (black filled rectangles).
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash
python scripts/annotate.py <path to pdf> 0 --type redaction
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
```
The below image shows the detected redactions with green outlines.
![](data/redaction_detection.png)
#### Layout Parsing
The layout parsing utility detects elements such as paragraphs, tables and figures.
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
```
The below image shows the detected layout elements on a page.
![](data/layout_parsing.png)
#### Figure Detection
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
```bash
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
```
The below image shows the detected figure on a page.
![](data/figure_detection.png)

BIN
data/figure_detection.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 707 KiB

BIN
data/layout_parsing.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 568 KiB

BIN
data/table_parsing.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 566 KiB

View File

@ -1,15 +1,16 @@
import argparse
from vidocp.table_parsing_2 import annotate_tables_in_pdf
from vidocp.redaction_detection import annotate_boxes_in_pdf
from vidocp.layout_detection import annotate_layout_in_pdf
from vidocp.table_parsing import annotate_tables_in_pdf
from vidocp.redaction_detection import annotate_redactions_in_pdf
from vidocp.layout_parsing import annotate_layout_in_pdf
from vidocp.figure_detection import detect_figures_in_pdf
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("pdf_path")
parser.add_argument("page_index", type=int)
parser.add_argument("--type", choices=["table", "redaction", "layout"], default="table")
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure"])
args = parser.parse_args()
@ -21,6 +22,8 @@ if __name__ == "__main__":
if args.type == "table":
annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
elif args.type == "redaction":
annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index)
annotate_redactions_in_pdf(args.pdf_path, page_index=args.page_index)
elif args.type == "layout":
annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index)
elif args.type == "figure":
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index)

View File

@ -0,0 +1,39 @@
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils.detection import detect_large_coherent_structures
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_rectangles
from vidocp.utils.post_processing import remove_included
from vidocp.utils.filters import is_large_enough, has_acceptable_format
from vidocp.utils.text import remove_primary_text_regions
def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
    """Decides whether a contour plausibly outlines a figure.

    Args:
        cont: Contour to test.
        min_area: Area bound handed to `is_large_enough`.
        max_width_to_hight_ratio: Aspect-ratio bound handed to `has_acceptable_format`.

    Returns:
        A truthy value only if both the size check and the format check accept the contour.
    """
    if not is_large_enough(cont, min_area):
        return False
    return has_acceptable_format(cont, max_width_to_hight_ratio)
def detect_figures(image: np.array):
    """Detects figures on a page image.

    Primary text is blanked out first; the remaining large coherent structures are filtered with
    `is_likely_figure`, turned into bounding rectangles, and rectangles lying inside others are dropped.

    Args:
        image: Page image; a copy is processed, the argument itself is not modified.

    Returns:
        Iterable of (x, y, w, h) rectangles as produced by `remove_included`.
    """
    text_free = remove_primary_text_regions(image.copy())
    candidates = detect_large_coherent_structures(text_free)
    figure_contours = (cnt for cnt in candidates if is_likely_figure(cnt))
    bounding_boxes = (cv2.boundingRect(cnt) for cnt in figure_contours)
    return remove_included(bounding_boxes)
def detect_figures_in_pdf(pdf_path, page_index=1):
    """Renders one page of a PDF, outlines the detected figures and displays the result.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Index of the page; the page number requested from pdf2image is `page_index + 1`.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    figure_rects = detect_figures(page)
    show_mpl(draw_rectangles(page, figure_rects))

View File

@ -1,10 +1,8 @@
from itertools import count
import cv2
import imutils
import numpy as np
import pdf2image
from matplotlib import pyplot as plt
import imutils
def find_layout_boxes(image: np.array):

71
vidocp/layout_parsing.py Normal file
View File

@ -0,0 +1,71 @@
from itertools import compress
from itertools import starmap
from operator import __and__
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_rectangles
from vidocp.utils.post_processing import remove_overlapping, remove_included, has_no_parent
def is_likely_segment(rect, min_area=100):
    """Checks whether a contour encloses more than `min_area` pixels.

    Args:
        rect: Contour whose area is measured with `cv2.contourArea`.
        min_area: Exclusive lower bound for the area.

    Returns:
        True if the contour area exceeds `min_area`.
    """
    area = cv2.contourArea(rect, False)
    return area > min_area
def find_segments(image):
    """Finds bounding rectangles of sufficiently large, parentless contours.

    Args:
        image: Single-channel binary image handed to `cv2.findContours`.

    Returns:
        Lazy iterator of (x, y, w, h) bounding rectangles of the contours that pass both
        `is_likely_segment` and `has_no_parent`. The iterator is empty if no contour is found.
    """
    contours, hierarchies = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # OpenCV reports the hierarchy as None when the image contains no contours at all
    # (e.g. a blank page); indexing it would raise a TypeError.
    if hierarchies is None:
        return iter(())

    return (
        cv2.boundingRect(contour)
        for contour, hierarchy in zip(contours, hierarchies[0])
        if is_likely_segment(contour) and has_no_parent(hierarchy)
    )
def parse_layout(image: np.array):
    """Detects layout segments on a page image in two passes.

    The first pass finds segments on a blurred, Otsu-thresholded and dilated version of the page. Those
    segments are then painted onto the page copy as black boxes with a white border, and a second
    detection pass runs on the painted image. Included and overlapping rectangles are removed at the end.

    Args:
        image: Three-channel page image; a copy is processed, the argument itself is not modified.

    Returns:
        Iterable of (x, y, w, h) rectangles as produced by `remove_overlapping`.
    """
    canvas = image.copy()

    grayscale = cv2.cvtColor(canvas, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (7, 7), 0)
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    square_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    thickened = cv2.dilate(binary, square_kernel, iterations=4)

    first_pass = list(find_segments(thickened))

    # -> Run meta detection on the previous detections TODO: refactor
    for x, y, w, h in first_pass:
        top_left, bottom_right = (x, y), (x + w, y + h)
        # Fill the segment black, then separate it from its neighbours with a white border.
        cv2.rectangle(canvas, top_left, bottom_right, (0, 0, 0), -1)
        cv2.rectangle(canvas, top_left, bottom_right, (255, 255, 255), 7)

    canvas = cv2.threshold(canvas, 254, 255, cv2.THRESH_BINARY)[1]
    canvas = cv2.cvtColor(~canvas, cv2.COLOR_BGR2GRAY)

    second_pass = find_segments(canvas)
    # <- End of meta detection

    return remove_overlapping(remove_included(second_pass))
def annotate_layout_in_pdf(pdf_path, page_index=1):
    """Renders one page of a PDF, outlines the detected layout elements and displays the result.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Index of the page; the page number requested from pdf2image is `page_index + 1`.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    layout_rects = parse_layout(page)
    show_mpl(draw_rectangles(page, layout_rects))

View File

@ -4,22 +4,10 @@ import cv2
import numpy as np
import pdf2image
from iteration_utilities import starfilter, first
from matplotlib import pyplot as plt
def is_filled(hierarchy):
# See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
return hierarchy[3] <= 0 and hierarchy[2] == -1
def is_boxy(contour):
epsilon = 0.01 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, epsilon, True)
return len(approx) <= 10
def is_large_enough(contour, min_area):
return cv2.contourArea(contour, False) > min_area
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_contours
from vidocp.utils.filters import is_large_enough, is_filled, is_boxy
def is_likely_redaction(contour, hierarchy, min_area):
@ -34,7 +22,7 @@ def find_redactions(image: np.array, min_normalized_area=200000):
blurred = cv2.GaussianBlur(gray, (5, 5), 1)
thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1]
contours, hierarchies = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
contours = map(
first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
@ -42,22 +30,12 @@ def find_redactions(image: np.array, min_normalized_area=200000):
return contours
def annotate_poly(image, contours):
for cont in contours:
cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
return image
def annotate_boxes_in_pdf(pdf_path, page_index=1):
def annotate_redactions_in_pdf(pdf_path, page_index=1):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
page = annotate_poly(page, redaction_contours)
page = draw_contours(page, redaction_contours)
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(20, 20)
ax.imshow(page)
plt.show()
show_mpl(page)

56
vidocp/table_parsing.py Normal file
View File

@ -0,0 +1,56 @@
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_stats
def add_external_contours(image, img):
    """Draws the bounding boxes of the external contours of `img` onto `image`.

    Args:
        image: Image that is drawn on in place.
        img: Binary image whose external contours are searched.

    Returns:
        `image`, with a one pixel wide rectangle of value 255 around every external contour.
    """
    outer_contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    for x, y, w, h in map(cv2.boundingRect, outer_contours):
        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)

    return image
def isolate_vertical_and_horizontal_components(img_bin):
    """Keeps only the horizontal and vertical line-like parts of a binary image.

    The image is morphologically opened once with a 1x30 kernel and once with a 30x1 kernel; the two
    results are combined with a bitwise OR.

    Args:
        img_bin: Binary image to filter.

    Returns:
        Binary image containing the union of both opening results.
    """
    min_line_length = 30

    horizontal_kernel = np.ones((1, min_line_length), np.uint8)
    vertical_kernel = np.ones((min_line_length, 1), np.uint8)

    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    return horizontal_lines | vertical_lines
def parse_table(image: np.array):
    """Segments a page image into table cells.

    The page is binarised at 150 and inverted, reduced to its horizontal and vertical line components,
    and framed with the bounding boxes of its external contours. Connected components of the inverted
    line mask are then computed.

    Args:
        image: Three-channel page image.

    Returns:
        The `stats` array of `cv2.connectedComponentsWithStats`, one row per connected component.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)[1]

    line_mask = isolate_vertical_and_horizontal_components(~binary)
    # The mask serves both as contour source and as drawing target.
    framed = add_external_contours(line_mask, line_mask)

    components = cv2.connectedComponentsWithStats(~framed, connectivity=8, ltype=cv2.CV_32S)
    return components[2]
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Renders one page of a PDF, outlines the detected table cells and displays the result.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Index of the page; the page number requested from pdf2image is `page_index + 1`.
    """
    page_number = page_index + 1
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered[0])

    cell_stats = parse_table(page)
    show_mpl(draw_stats(page, cell_stats))

1
vidocp/utils/__init__.py Normal file
View File

@ -0,0 +1 @@
from .utils import *

23
vidocp/utils/detection.py Normal file
View File

@ -0,0 +1,23 @@
import cv2
import numpy as np
def detect_large_coherent_structures(image: np.array):
    """Detects large coherent structures on an image.

    Non-white pixels are dilated and then morphologically closed so that nearby marks merge into blobs;
    the external contours of those blobs are returned.

    Args:
        image: Three-channel image; it is converted with `cv2.COLOR_BGR2GRAY`.

    Returns:
        The external contours found by `cv2.findContours` on the merged mask.

    References:
        https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Pixels brighter than 253 become 255, all others 0; the mask is inverted below so that the
    # page content becomes the foreground.
    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]

    # `getStructuringElement` expects a shape constant. The operation constant `MORPH_OPEN` that was
    # passed here before has the same numeric value as `MORPH_ELLIPSE`, so the kernel is unchanged.
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)

    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)

    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    return cnts

16
vidocp/utils/display.py Normal file
View File

@ -0,0 +1,16 @@
import cv2
from matplotlib import pyplot as plt
def show_mpl(image):
    """Displays an image in a 20 x 20 inch matplotlib figure.

    Args:
        image: Image handed to `Axes.imshow`.
    """
    figure, axes = plt.subplots(1, 1)
    figure.set_size_inches(20, 20)
    axes.imshow(image)
    plt.show()
def show_cv2(image):
    """Displays an image in an untitled OpenCV window and waits for a key press.

    Args:
        image: Image handed to `cv2.imshow`.
    """
    window_name = ""
    cv2.imshow(window_name, image)
    # A delay of 0 makes `waitKey` wait for a key event without a timeout.
    cv2.waitKey(0)

56
vidocp/utils/draw.py Normal file
View File

@ -0,0 +1,56 @@
import cv2
from vidocp.utils import copy_and_normalize_channels
def draw_contours(image, contours):
    """Draws contours in green onto a copy of an image.

    Args:
        image: Image to annotate; it is copied via `copy_and_normalize_channels` first.
        contours: Iterable of contours; each one is passed to `cv2.drawContours` separately.

    Returns:
        The annotated copy.
    """
    canvas = copy_and_normalize_channels(image)
    green, thickness = (0, 255, 0), 4

    for contour in contours:
        cv2.drawContours(canvas, contour, -1, green, thickness)

    return canvas
def draw_rectangles(image, rectangles, color=None):
    """Draws rectangle outlines onto a copy of an image.

    Args:
        image: Image to annotate; it is copied via `copy_and_normalize_channels` first.
        rectangles: Iterable of (x, y, w, h) rectangles.
        color: Colour passed to `cv2.rectangle`. Defaults to green (0, 255, 0) when None.

    Returns:
        The annotated copy.
    """
    image = copy_and_normalize_channels(image)

    # Compare against None explicitly: a truthiness test would silently replace valid but falsy
    # colours such as the scalar 0 with the default.
    if color is None:
        color = (0, 255, 0)

    for x, y, w, h in rectangles:
        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)

    return image
def draw_stats(image, stats, annotate=False):
    """Draws the boxes of connected-component statistics onto a copy of an image.

    Args:
        image: Image to annotate; it is copied via `copy_and_normalize_channels` first.
        stats: Sequence of (x, y, w, h, area) rows. The first two rows are skipped.
        annotate: If true, the x, y, w and h values are additionally written into each box.

    Returns:
        The annotated copy.
    """
    canvas = copy_and_normalize_channels(image)
    green = (0, 255, 0)
    labels = ["x", "y", "w", "h"]

    def write_values(x, y, w, h):
        # One text line per value, stacked upwards from the bottom edge of the box in 20 px steps.
        for line_no, (label, value) in enumerate(zip(labels, [x, y, w, h]), start=1):
            text = f"{label} = {value}"
            origin = (int(x + 5), int(y + h - (20 * line_no)))
            cv2.putText(canvas, text, origin, cv2.FONT_HERSHEY_SIMPLEX, 0.7, green, 2)

    for x, y, w, h, _area in stats[2:]:
        cv2.rectangle(canvas, (x, y), (x + w, y + h), green, 2)
        if annotate:
            write_values(x, y, w, h)

    return canvas

25
vidocp/utils/filters.py Normal file
View File

@ -0,0 +1,25 @@
import cv2
def is_large_enough(cont, min_area):
    """Checks whether a contour encloses more than `min_area` pixels.

    Args:
        cont: Contour whose area is measured with `cv2.contourArea`.
        min_area: Exclusive lower bound for the area.

    Returns:
        True if the contour area exceeds `min_area`.
    """
    area = cv2.contourArea(cont, False)
    return area > min_area
def has_acceptable_format(cont, max_width_to_height_ratio):
    """Checks whether the bounding box of a contour is neither too wide nor too tall.

    Args:
        cont: Contour whose bounding rectangle is examined.
        max_width_to_height_ratio: Largest accepted width / height ratio; its reciprocal is the
            smallest accepted ratio.

    Returns:
        True if the width / height ratio lies within both bounds (inclusive).
    """
    _, _, width, height = cv2.boundingRect(cont)
    aspect = width / height

    # The upper bound is checked first so that the reciprocal is only computed when needed.
    if aspect > max_width_to_height_ratio:
        return False
    return aspect >= (1 / max_width_to_height_ratio)
def is_filled(hierarchy):
    """Checks whether a contour hierarchy entry describes a filled shape.

    Args:
        hierarchy: Hierarchy entry of one contour; element 2 and element 3 are read.

    Returns:
        A truthy value if element 3 is at most 0 and element 2 equals -1.

    References:
        https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
    """
    # NOTE(review): presumably the OpenCV layout [next, previous, first_child, parent] - confirm.
    parent, first_child = hierarchy[3], hierarchy[2]
    return parent <= 0 and first_child == -1
def is_boxy(contour):
    """Checks whether a contour can be approximated by a polygon with few vertices.

    Args:
        contour: Closed contour to approximate.

    Returns:
        True if the approximation with a tolerance of 1 % of the perimeter has at most 10 vertices.
    """
    perimeter = cv2.arcLength(contour, True)
    polygon = cv2.approxPolyDP(contour, 0.01 * perimeter, True)
    return len(polygon) <= 10

View File

@ -0,0 +1,62 @@
from collections import namedtuple
from functools import partial
def remove_overlapping(rectangles):
    """Drops every rectangle that overlaps a different rectangle.

    Args:
        rectangles: Iterable of (x, y, w, h) rectangles.

    Returns:
        Lazy iterator over the (x, y, w, h) rectangles whose intersection area with every other,
        non-equal rectangle is zero.
    """
    boxes = [xywh_to_vec_rect(r) for r in rectangles]

    def is_isolated(box):
        # Rectangles equal to `box` are skipped, so a rectangle never counts as overlapping itself.
        return all(compute_intersection(box, other) <= 0 for other in boxes if other != box)

    return map(vec_rect_to_xywh, filter(is_isolated, boxes))
def remove_included(rectangles):
    """Drops every rectangle that lies completely inside a different rectangle.

    Args:
        rectangles: Iterable of (x, y, w, h) rectangles.

    Returns:
        Lazy iterator over the (x, y, w, h) rectangles that are not contained in any other,
        non-equal rectangle.
    """
    boxes = [xywh_to_vec_rect(r) for r in rectangles]

    def contains(outer, inner):
        return (
            inner.xmin >= outer.xmin
            and inner.ymin >= outer.ymin
            and inner.xmax <= outer.xmax
            and inner.ymax <= outer.ymax
        )

    def is_outermost(box):
        # Rectangles equal to `box` are skipped, so a rectangle never counts as included in itself.
        return not any(contains(other, box) for other in boxes if other != box)

    return map(vec_rect_to_xywh, filter(is_outermost, boxes))
# Axis-aligned rectangle stored as its minimum and maximum x / y coordinates.
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
def make_box(x1, y1, x2, y2):
    """Bundles four coordinates into a dictionary.

    Returns:
        Dictionary with the keys "x1", "y1", "x2" and "y2" mapped to the respective arguments.
    """
    return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
def compute_intersection(a, b):
    """Computes the area of the intersection of two rectangles.

    Args:
        a: Rectangle with the attributes xmin, ymin, xmax and ymax.
        b: Rectangle with the attributes xmin, ymin, xmax and ymax.

    Returns:
        The intersection area, or 0 if the rectangles are disjoint.
    """
    overlap_width = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)
    overlap_height = min(a.ymax, b.ymax) - max(a.ymin, b.ymin)

    # A negative extent along either axis means the rectangles do not meet on that axis.
    if overlap_width < 0 or overlap_height < 0:
        return 0
    return overlap_width * overlap_height
def has_no_parent(hierarchy):
    """Checks whether a contour hierarchy entry has no parent contour.

    Args:
        hierarchy: Hierarchy entry of one contour as returned by `cv2.findContours`; its last element
            is the index of the parent contour.

    Returns:
        True if the parent index is negative.
    """
    # OpenCV marks a missing parent with -1. Index 0 refers to the first contour and is a valid
    # parent, so it must not be treated as "no parent".
    return hierarchy[-1] < 0
def xywh_to_vec_rect(rect):
    """Converts an (x, y, w, h) rectangle into a `Rectangle` with min / max coordinates.

    Args:
        rect: Iterable of four values: x, y, width and height.

    Returns:
        `Rectangle(x, y, x + w, y + h)`.
    """
    left, top, width, height = rect
    return Rectangle(left, top, left + width, top + height)
def vec_rect_to_xywh(rect):
    """Converts a rectangle given by min / max coordinates into (x, y, w, h) form.

    Args:
        rect: Iterable of four values: xmin, ymin, xmax and ymax.

    Returns:
        Tuple (xmin, ymin, xmax - xmin, ymax - ymin).
    """
    xmin, ymin, xmax, ymax = rect
    return xmin, ymin, xmax - xmin, ymax - ymin

57
vidocp/utils/text.py Normal file
View File

@ -0,0 +1,57 @@
import cv2
def remove_primary_text_regions(image):
    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.

    Args:
        image: Image to remove primary text from; a copy is processed, the argument itself is not modified.

    Returns:
        Copy of the image in which the bounding box of every detected primary text region is filled white.
    """
    cleaned = image.copy()
    white = (255, 255, 255)

    for x, y, w, h in map(cv2.boundingRect, find_primary_text_regions(cleaned)):
        # A negative thickness makes OpenCV fill the rectangle.
        cv2.rectangle(cleaned, (x, y), (x + w, y + h), white, -1)

    return cleaned
def find_primary_text_regions(image):
    """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.

    Args:
        image: Three-channel image to search; a copy is processed, the argument itself is not modified.

    Returns:
        Lazy iterator over the contours whose area lies strictly between 800 and 15000.

    References:
        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
    """

    def is_likely_primary_text_segments(cnt):
        return 800 < cv2.contourArea(cnt) < 15000

    image = image.copy()

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # NOTE: with THRESH_OTSU set, OpenCV determines the threshold itself and ignores the value 253.
    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # The wide, flat kernel joins characters horizontally.
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
    close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)

    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
    dilate = cv2.dilate(close, dilate_kernel, iterations=1)

    cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    cnts = filter(is_likely_primary_text_segments, cnts)

    return cnts

12
vidocp/utils/utils.py Normal file
View File

@ -0,0 +1,12 @@
import cv2
def copy_and_normalize_channels(image):
    """Copies an image and converts the copy from grayscale to BGR where possible.

    Args:
        image: Image to copy.

    Returns:
        The result of the `cv2.COLOR_GRAY2BGR` conversion of the copy, or the unconverted copy if
        OpenCV rejects the conversion.
    """
    duplicate = image.copy()
    try:
        return cv2.cvtColor(duplicate, cv2.COLOR_GRAY2BGR)
    except cv2.error:
        # Best effort: the conversion is skipped for images OpenCV cannot treat as grayscale.
        return duplicate