first working version with new API
This commit is contained in:
parent
8cd8c1b1f0
commit
a089fa5e42
1
.gitignore
vendored
1
.gitignore
vendored
@ -15,6 +15,7 @@ build_venv/
|
||||
/results/
|
||||
/data
|
||||
/table_parsing.egg-info
|
||||
/target/
|
||||
/tests/VV-313450.pdf
|
||||
/vidocp.egg-info/dependency_links.txt
|
||||
/vidocp.egg-info/PKG-INFO
|
||||
|
||||
10
config.yaml
10
config.yaml
@ -6,4 +6,12 @@ service:
|
||||
webserver:
|
||||
host: $SERVER_HOST|"127.0.0.1" # webserver address
|
||||
port: $SERVER_PORT|5000 # webserver port
|
||||
mode: $SERVER_MODE|production # webserver mode: {development, production}
|
||||
mode: $SERVER_MODE|production # webserver mode: {development, production}
|
||||
|
||||
deskew:
|
||||
preprocess: True
|
||||
max_abs_angle: 1.5
|
||||
delta: 0.15
|
||||
mode: nearest
|
||||
verbose: False
|
||||
filter_strength_h: 3
|
||||
@ -1,3 +1,4 @@
|
||||
# python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing
|
||||
import argparse
|
||||
import json
|
||||
import requests
|
||||
@ -8,22 +9,38 @@ from vidocp.utils.preprocessing import open_pdf
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--pdf_path", required=True, help="path to PDF file")
|
||||
parser.add_argument("--first_page", type=int, required=True, help="path to PDF file")
|
||||
parser.add_argument("--last_page", type=int, required=False, default=None, help="path to PDF file")
|
||||
parser.add_argument(
|
||||
"--first_page", type=int, required=False, default=0, help="page number from which to start (starts at 0)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--last_page",
|
||||
type=int,
|
||||
required=False,
|
||||
default=None,
|
||||
help="page number at which to stop (non-inclusive); specify None to go to the end",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--operations",
|
||||
type=str,
|
||||
required=False,
|
||||
help="Comma-separated list of operations, any of the following: \ntable-parsing\nredaction-detection\
|
||||
\nfigure-detection\nlayout-detection",
|
||||
default="table-parsing"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main(args):
|
||||
|
||||
#data = open_pdf(args.pdf_path, args.first_page, args.last_page)
|
||||
# params = json.dumps({
|
||||
# "pdf_path": "a",#args.pdf_path,
|
||||
# "first_page": 4,#args.first_page,
|
||||
# "last_page": 6#args.last_page
|
||||
# })
|
||||
response = requests.post("http://127.0.0.1:5000", data=open(args.pdf_path, "rb"))#, json=params)
|
||||
# files = {"name": (
|
||||
# "name",
|
||||
# open(args.pdf_path, "rb"),
|
||||
# "file object corresponding to pdf file",
|
||||
# {"operations": args.operations.split(",")}
|
||||
# )
|
||||
# }
|
||||
response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
|
||||
response.raise_for_status()
|
||||
predictions = response.json()
|
||||
|
||||
@ -32,4 +49,4 @@ def main(args):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
main(args)
|
||||
|
||||
@ -1,15 +1,16 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from typing import List
|
||||
from flask import Flask, request, jsonify
|
||||
from waitress import serve
|
||||
|
||||
from vidocp.utils import npconvert
|
||||
from vidocp.utils.preprocessing import preprocess_pdf_image #TODO
|
||||
from vidocp.table_parsing import parse_table#, detect_tables_in_pdf
|
||||
from vidocp.redaction_detection import find_redactions#, detect_redactions_in_pdf
|
||||
from vidocp.layout_parsing import parse_layout#, detect_layout_in_pdf #TODO
|
||||
from vidocp.figure_detection import detect_figures#, detect_figures_in_pdf #TODO
|
||||
from vidocp.utils.preprocessing import preprocess_pdf_image # TODO
|
||||
from vidocp.table_parsing import parse_table # , detect_tables_in_pdf
|
||||
from vidocp.redaction_detection import find_redactions # , detect_redactions_in_pdf
|
||||
from vidocp.layout_parsing import parse_layout # , detect_layout_in_pdf #TODO
|
||||
from vidocp.figure_detection import detect_figures # , detect_figures_in_pdf #TODO
|
||||
from vidocp.utils.logging import logger
|
||||
from vidocp.utils.preprocessing import open_pdf
|
||||
from vidocp.config import CONFIG
|
||||
@ -17,58 +18,39 @@ from vidocp.config import CONFIG
|
||||
|
||||
def suppress_user_warnings():
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
|
||||
# def parse_args():
|
||||
# parser = argparse.ArgumentParser()
|
||||
# parser.add_argument("--warnings", action="store_true", default=False)
|
||||
# args = parser.parse_args()
|
||||
|
||||
# return args
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
#if not args.warnings:
|
||||
# suppress_user_warnings()
|
||||
|
||||
run_server()
|
||||
|
||||
|
||||
def run_server():
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route("/", methods=["POST"])
|
||||
def predict_request():
|
||||
def inner():
|
||||
data = request.data
|
||||
#print(type(request))
|
||||
#print(dir(request))
|
||||
params = request.json
|
||||
#print("params:", params)
|
||||
logger.info(f"<3 Received data.")
|
||||
print("data type:", type(data))
|
||||
#print("json type:", type(params))
|
||||
logger.info(f"Processing data. <3")
|
||||
pdf_data = open_pdf(data)
|
||||
predictions = make_predictions(pdf_data)
|
||||
return jsonify({"result": predictions})
|
||||
try:
|
||||
return inner()
|
||||
except Exception as err:
|
||||
logger.warning("Analysis failed")
|
||||
logger.exception(err)
|
||||
resp = jsonify("Analysis failed")
|
||||
resp.status_code = 500
|
||||
return resp
|
||||
@app.route("/tables", methods=["POST"])
|
||||
def get_tables():
|
||||
return annotate("tables")
|
||||
|
||||
@app.route("/redactions", methods=["POST"])
|
||||
def get_redactions():
|
||||
return annotate("redactions")
|
||||
|
||||
@app.route("/figures", methods=["POST"])
|
||||
def get_figures():
|
||||
return annotate("figures")
|
||||
|
||||
@app.route("/layout", methods=["POST"])
|
||||
def get_layout():
|
||||
return annotate("layout")
|
||||
|
||||
@app.route("/status", methods=["GET"])
|
||||
def status():
|
||||
response = "OK"
|
||||
return jsonify(response)
|
||||
|
||||
#predictor = initialize_predictor()
|
||||
# predictor = initialize_predictor()
|
||||
logger.info("<3 Annotator ready.")
|
||||
|
||||
mode = CONFIG.webserver.mode
|
||||
@ -79,21 +61,49 @@ def run_server():
|
||||
logging.info("Production.")
|
||||
|
||||
|
||||
def make_predictions(pdf_data):
|
||||
output = {}
|
||||
pdf = open_pdf(pdf_data)
|
||||
for i, page in enumerate(pdf):
|
||||
page = preprocess_pdf_image(page)
|
||||
tables = json.dumps(list(parse_table(page)), default=npconvert) #list() for consistency; not strictly necessary
|
||||
redactions = json.dumps(list(find_redactions(page)), default=npconvert)
|
||||
layout = json.dumps(list(parse_layout(page)), default=npconvert)
|
||||
figure = json.dumps(list(detect_figures(page)), default=npconvert)
|
||||
output.update({i: {"tables": tables,
|
||||
"redactions": redactions,
|
||||
"layout": layout,
|
||||
"figure": figure}})
|
||||
return output
|
||||
def apply_annotation_function(annotation_function, page_list):
|
||||
outdict = {}
|
||||
for i, page in enumerate(page_list):
|
||||
results = annotation_function(page)
|
||||
if results:
|
||||
outdict.update({i: results})
|
||||
return outdict
|
||||
|
||||
|
||||
def make_annotations(pdf_data, task):
|
||||
pdf = open_pdf(pdf_data)
|
||||
|
||||
if task == "tables":
|
||||
annotation = {"tables": apply_annotation_function(parse_table, pdf)}
|
||||
elif task == "redactions":
|
||||
annotation = {"redactions": apply_annotation_function(find_redactions, pdf)}
|
||||
elif task == "figures":
|
||||
annotation = {"figures": apply_annotation_function(detect_figures, pdf)}
|
||||
elif task == "layout":
|
||||
annotation = {"layout": apply_annotation_function(parse_layout, pdf)}
|
||||
else:
|
||||
raise ValueError(f"'{task}' is not a valid operation keyword. Valid values include: \
|
||||
\ntables\nredactions\nfigures\nlayout\n")
|
||||
|
||||
return json.dumps(annotation, default=npconvert)
|
||||
|
||||
|
||||
def annotate(task):
|
||||
def inner():
|
||||
data = request.data
|
||||
logger.info(f"<3 Received data.")
|
||||
logger.info(f"Processing data. <3")
|
||||
annotations = make_annotations(data, task)
|
||||
return jsonify({"result": annotations})
|
||||
try:
|
||||
return inner()
|
||||
except Exception as err:
|
||||
logger.warning("Analysis failed")
|
||||
logger.exception(err)
|
||||
resp = jsonify("Analysis failed")
|
||||
resp.status_code = 500
|
||||
return resp
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -21,9 +21,8 @@ def test_num_of_rects(rects):
|
||||
def test_range_of_rects(rects):
|
||||
expected_range = ((210, 605), (1430, 1620))
|
||||
topleft = min(rects)
|
||||
x,y,w,h = max(rects)
|
||||
bottomright = (x+w, y+h)
|
||||
x, y, w, h = max(rects)
|
||||
bottomright = (x + w, y + h)
|
||||
|
||||
assert topleft >= expected_range[0]
|
||||
assert bottomright <= expected_range[1]
|
||||
|
||||
|
||||
@ -36,4 +36,4 @@ class Config:
|
||||
return self.__getattr__(item)
|
||||
|
||||
|
||||
CONFIG = Config(CONFIG_FILE)
|
||||
CONFIG = Config(CONFIG_FILE)
|
||||
|
||||
@ -25,7 +25,7 @@ def detect_figures(image: np.array):
|
||||
rects = map(cv2.boundingRect, cnts)
|
||||
rects = remove_included(rects)
|
||||
|
||||
return rects
|
||||
return list(rects)
|
||||
|
||||
|
||||
def detect_figures_in_pdf(pdf_path, page_index=1, show=True):
|
||||
|
||||
@ -32,7 +32,7 @@ def parse_layout(image: np.array):
|
||||
|
||||
image = image.copy()
|
||||
image_ = image.copy()
|
||||
|
||||
|
||||
if len(image_.shape) > 2:
|
||||
image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2GRAY)
|
||||
image_ = cv2.GaussianBlur(image_, (7, 7), 0)
|
||||
@ -60,7 +60,7 @@ def parse_layout(image: np.array):
|
||||
rects = remove_included(rects)
|
||||
rects = remove_overlapping(rects)
|
||||
|
||||
return rects
|
||||
return list(rects)
|
||||
|
||||
|
||||
def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):
|
||||
@ -70,7 +70,7 @@ def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):
|
||||
|
||||
rects = parse_layout(page)
|
||||
page = draw_rectangles(page, rects)
|
||||
|
||||
|
||||
if show:
|
||||
show_mpl(page)
|
||||
else:
|
||||
|
||||
@ -11,4 +11,4 @@ LOG_FILE = "/tmp/log.log"
|
||||
|
||||
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
|
||||
|
||||
TEST_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "test", "test_data")
|
||||
TEST_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "test", "test_data")
|
||||
|
||||
@ -30,7 +30,7 @@ def find_redactions(image: np.array, min_normalized_area=200000):
|
||||
contours = map(
|
||||
first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
|
||||
)
|
||||
return contours
|
||||
return list(contours)
|
||||
|
||||
|
||||
def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):
|
||||
|
||||
@ -1,171 +0,0 @@
|
||||
from itertools import count
|
||||
|
||||
import cv2
|
||||
import imutils
|
||||
import numpy as np
|
||||
import pdf2image
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
|
||||
def parse(image: np.array):
|
||||
if len(image.shape) > 2:
|
||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||
#plt.imshow(gray_scale)
|
||||
image = cv2.GaussianBlur(image, (7, 7), 2) #5 5 1
|
||||
image = cv2.threshold(image, 251, 255, cv2.THRESH_BINARY)[1]
|
||||
#plt.imshow(thresh)
|
||||
img_bin = ~image
|
||||
|
||||
line_min_width = 7
|
||||
kernel_h = np.ones((10, line_min_width), np.uint8)
|
||||
kernel_v = np.ones((line_min_width, 10), np.uint8)
|
||||
|
||||
img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
|
||||
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
|
||||
#plt.imshow(img_bin_h)
|
||||
#plt.imshow(img_bin_v)
|
||||
img_bin_final = img_bin_h | img_bin_v
|
||||
plt.imshow(img_bin_final)
|
||||
contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
contours = imutils.grab_contours(contours)
|
||||
for c in contours:
|
||||
peri = cv2.arcLength(c, True)
|
||||
approx = cv2.approxPolyDP(c, 0.04 * peri, True)
|
||||
yield cv2.boundingRect(approx)
|
||||
|
||||
def parse_tables(image: np.array, rects: list):
|
||||
parsed_tables = []
|
||||
for rect in rects:
|
||||
(x,y,w,h) = rect
|
||||
region_of_interest = image[x:x+w, y:y+h]
|
||||
if len(region_of_interest.shape) > 2:
|
||||
region_of_interest = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
|
||||
region_of_interest = cv2.threshold(region_of_interest, 200, 255, cv2.THRESH_BINARY)[1]
|
||||
img_bin = ~region_of_interest
|
||||
|
||||
line_min_width = 5
|
||||
kernel_h = np.ones((1, line_min_width), np.uint8)
|
||||
kernel_v = np.ones((line_min_width, 1), np.uint8)
|
||||
|
||||
img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
|
||||
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
|
||||
# find_and_close_internal_gaps(img_bin_v)
|
||||
img_bin_final = img_bin_h | img_bin_v
|
||||
#plt.imshow(img_bin_final)
|
||||
# find_and_close_internal_gaps(img_bin_final)
|
||||
# find_and_close_edges(img_bin_final)
|
||||
|
||||
_, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
|
||||
parsed_tables.append([(x,y,w,h), stats])
|
||||
return parsed_tables
|
||||
#yield (x,y,w,h), stats, region_of_interest
|
||||
# return stats
|
||||
|
||||
def annotate_table(image, parsed_tables):
|
||||
for table in parsed_tables:
|
||||
original_coordinates, stats = table
|
||||
stats = filter_unconnected_cells(stats)
|
||||
for stat in stats:
|
||||
x, y, w, h, area = stat
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
|
||||
for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
|
||||
anno = f"{s} = {v}"
|
||||
xann = int(x + 5)
|
||||
yann = int(y + h - (20 * (i + 1)))
|
||||
cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
|
||||
|
||||
return image
|
||||
|
||||
|
||||
def filter_unconnected_cells(stats):
|
||||
filtered_cells = []
|
||||
# print(stats)
|
||||
for left, middle, right in zip(stats[0:], stats[1:],
|
||||
list(stats[2:]) + [np.array([None, None, None, None, None])]):
|
||||
x, y, w, h, area = middle
|
||||
if w > 35 and h > 13 and area > 500:
|
||||
if right[1] is None:
|
||||
if y == left[1] or x == left[0]:
|
||||
filtered_cells.append(middle)
|
||||
else:
|
||||
if y == left[1] or y == right[1] or x == left[0] or x == right[0]:
|
||||
filtered_cells.append(middle)
|
||||
return filtered_cells
|
||||
|
||||
def find_and_close_edges(img_bin_final):
|
||||
contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
for cnt in contours:
|
||||
missing_external_edges = True
|
||||
left = tuple(cnt[cnt[:, :, 0].argmin()][0])
|
||||
right = tuple(cnt[cnt[:, :, 0].argmax()][0])
|
||||
top = tuple(cnt[cnt[:, :, 1].argmin()][0])
|
||||
bottom = tuple(cnt[cnt[:, :, 1].argmax()][0])
|
||||
topleft = [left[0], top[1]]
|
||||
bottomright = [right[0], bottom[1]]
|
||||
for arr in cnt:
|
||||
if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])):
|
||||
missing_external_edges = False
|
||||
break
|
||||
|
||||
if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000:
|
||||
cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2)
|
||||
# print("missing cell detectet rectangle drawn")
|
||||
|
||||
return img_bin_final
|
||||
|
||||
|
||||
|
||||
def parse_tables_in_pdf(pages):
|
||||
return zip(map(parse, pages), count())
|
||||
|
||||
# def annotate_tables_in_pdf(pdf_path, page_index=1):
|
||||
# # timeit()
|
||||
# page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
|
||||
# page = np.array(page)
|
||||
#
|
||||
# _, stats = parse(page)
|
||||
# page = annotate_image(page, stats)
|
||||
# # print(timeit())
|
||||
# fig, ax = plt.subplots(1, 1)
|
||||
# fig.set_size_inches(20, 20)
|
||||
# ax.imshow(page)
|
||||
# plt.show()
|
||||
|
||||
|
||||
def annotate_boxes(image, rects):
|
||||
print(type(rects))
|
||||
for rect in rects:
|
||||
(x, y, w, h) = rect
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
||||
|
||||
return image
|
||||
|
||||
def filter_tables_or_images(rects):
|
||||
filtered = []
|
||||
for rect in rects:
|
||||
(x,y,w,h) = rect
|
||||
print(w*h)
|
||||
if w * h > 10**6:
|
||||
filtered.append(rect)
|
||||
print(filtered)
|
||||
return filtered
|
||||
|
||||
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=1):
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
|
||||
page = np.array(page)
|
||||
|
||||
layout_boxes = parse(page)
|
||||
page = annotate_boxes(page, layout_boxes)
|
||||
parsed_tables = parse_tables(page, filter_tables_or_images(layout_boxes))
|
||||
page = annotate_table(page, parsed_tables)
|
||||
|
||||
|
||||
|
||||
fig, ax = plt.subplots(1, 1)
|
||||
fig.set_size_inches(20, 20)
|
||||
ax.imshow(page)
|
||||
plt.show()
|
||||
@ -1,5 +1,5 @@
|
||||
from functools import partial
|
||||
from itertools import chain, compress, starmap
|
||||
from itertools import chain, starmap
|
||||
from operator import attrgetter
|
||||
|
||||
import cv2
|
||||
@ -25,7 +25,6 @@ def add_external_contours(image, img):
|
||||
return image
|
||||
|
||||
|
||||
|
||||
def isolate_vertical_and_horizontal_components(img_bin, bounding_rects, show=False):
|
||||
line_min_width = 48
|
||||
kernel_h = np.ones((1, line_min_width), np.uint8)
|
||||
@ -77,19 +76,9 @@ def has_table_shape(rects):
|
||||
|
||||
rects = list(map(xywh_to_vec_rect, rects))
|
||||
|
||||
# print(rects)
|
||||
# print(brect)
|
||||
|
||||
def matches_bounding_rect_corner(rect, x, y):
|
||||
corresp_coords = list(zip(*map(attrgetter(x, y), [brect, rect])))
|
||||
ret = all(starmap(partial(adjacent1d, tolerance=30), corresp_coords))
|
||||
# print()
|
||||
# print(x, y)
|
||||
# print(brect)
|
||||
# print(rect)
|
||||
# print(corresp_coords)
|
||||
# print(ret)
|
||||
|
||||
return ret
|
||||
|
||||
return all(
|
||||
@ -120,31 +109,36 @@ def find_table_layout_boxes(image: np.array):
|
||||
return table_boxes
|
||||
|
||||
|
||||
def preprocess(image: np.array):
|
||||
if len(image.shape) > 2:
|
||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
|
||||
th1, image = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
|
||||
image = ~image
|
||||
return image
|
||||
|
||||
|
||||
def parse_table(image: np.array, show=False):
|
||||
def is_large_enough(stat):
|
||||
x1, y1, w, h, area = stat
|
||||
return area > 2000 and w > 35 and h > 25
|
||||
|
||||
if len(image.shape) > 2:
|
||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
|
||||
th1, img_bin = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
|
||||
img_bin = ~img_bin
|
||||
image = preprocess(image)
|
||||
if show:
|
||||
show_mpl(img_bin)
|
||||
show_mpl(image)
|
||||
|
||||
table_layout_boxes = find_table_layout_boxes(image)
|
||||
img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes)
|
||||
img_bin_final = add_external_contours(img_bin, img_bin)
|
||||
image = isolate_vertical_and_horizontal_components(image, table_layout_boxes)
|
||||
image = add_external_contours(image, image)
|
||||
|
||||
_, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
|
||||
_, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)
|
||||
|
||||
stats = np.vstack(list(filter(is_large_enough, stats)))
|
||||
rects = stats[:, :-1][2:]
|
||||
|
||||
# FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
|
||||
rects = list(remove_isolated(rects, input_sorted=True))
|
||||
rects = remove_isolated(rects, input_sorted=True)
|
||||
|
||||
return rects
|
||||
return list(rects)
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):
|
||||
@ -155,9 +149,7 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):
|
||||
|
||||
stats = parse_table(page)
|
||||
page = draw_rectangles(page, stats, annotate=True)
|
||||
# if stats:
|
||||
# page = draw_rectangles(page, stats, annotate=True)
|
||||
|
||||
|
||||
if show:
|
||||
show_mpl(page)
|
||||
else:
|
||||
|
||||
@ -1,75 +0,0 @@
|
||||
import cv2
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from pdf2image import pdf2image
|
||||
|
||||
|
||||
def add_external_contours(image, img):
|
||||
|
||||
contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
||||
|
||||
for cnt in contours:
|
||||
x, y, w, h = cv2.boundingRect(cnt)
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
|
||||
|
||||
return image
|
||||
|
||||
|
||||
def isolate_vertical_and_horizontal_components(img_bin):
|
||||
|
||||
line_min_width = 30
|
||||
kernel_h = np.ones((1, line_min_width), np.uint8)
|
||||
kernel_v = np.ones((line_min_width, 1), np.uint8)
|
||||
|
||||
img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
|
||||
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
|
||||
|
||||
img_bin_final = img_bin_h | img_bin_v
|
||||
|
||||
return img_bin_final
|
||||
|
||||
|
||||
def annotate_image(image, stats):
|
||||
|
||||
image = image.copy()
|
||||
|
||||
for x, y, w, h, area in stats[2:]:
|
||||
if w > 10 and h > 10:
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
|
||||
|
||||
for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
|
||||
anno = f"{s} = {v}"
|
||||
xann = int(x + 5)
|
||||
yann = int(y + h - (20 * (i + 1)))
|
||||
cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
|
||||
|
||||
return image
|
||||
|
||||
|
||||
def parse_table(image: np.array):
|
||||
|
||||
if len(image.shape) > 2:
|
||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||
th1, img_bin = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
|
||||
img_bin = ~img_bin
|
||||
|
||||
img_bin = isolate_vertical_and_horizontal_components(img_bin)
|
||||
img_bin_final = add_external_contours(img_bin, img_bin)
|
||||
|
||||
_, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=1):
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
|
||||
page = np.array(page)
|
||||
|
||||
stats = parse_table(page)
|
||||
page = annotate_image(page, stats)
|
||||
|
||||
fig, ax = plt.subplots(1, 1)
|
||||
fig.set_size_inches(20, 20)
|
||||
ax.imshow(page)
|
||||
plt.show()
|
||||
@ -2,78 +2,46 @@ import numpy as np
|
||||
from scipy.ndimage import rotate
|
||||
import cv2
|
||||
|
||||
|
||||
def detect_angle_from_lines(im: np.array, max_skew_deg=10, min_skew_deg=0.1, min_nlines=5) -> int:
|
||||
max_skew_rad = np.deg2rad(max_skew_deg)
|
||||
min_skew_rad = np.deg2rad(min_skew_deg)
|
||||
width = im.shape[1]
|
||||
|
||||
if len(im.shape) > 2:
|
||||
im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
|
||||
im = cv2.fastNlMeansDenoising(im, h=3)
|
||||
im_bw = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
|
||||
|
||||
lines = cv2.HoughLinesP(im_bw, 1, np.pi / 180, 200, minLineLength=width / 12, maxLineGap=width / 150)
|
||||
|
||||
angles = []
|
||||
for line in lines:
|
||||
x1, y1, x2, y2 = line[0]
|
||||
raw_angle = np.arctan2(y2 - y1, x2 - x1)
|
||||
angles.append(min(raw_angle, np.pi / 2 - raw_angle))
|
||||
angles = [angle for angle in angles if (abs(angle) < max_skew_rad)]
|
||||
nonzero = list(filter(lambda x: x != 0, angles))
|
||||
|
||||
# empirically found this ad hoc approach to work
|
||||
robust_avg = (np.mean(angles) + np.mean(nonzero) + np.median(nonzero)) / 3
|
||||
# slightly lower alternative:
|
||||
# robust_avg = (np.mean(angles) + np.mean(nonzero) + np.median(angles) + np.median(nonzero)) / 4
|
||||
|
||||
if robust_avg < min_skew_rad or min(len(angles), len(nonzero)) < min_nlines:
|
||||
return 0.0
|
||||
return np.rad2deg(robust_avg)
|
||||
from vidocp.config import CONFIG
|
||||
|
||||
|
||||
def rotate_straight(im: np.array, skew_angle: int) -> np.array:
|
||||
h, w = im.shape[:2]
|
||||
center = (w // 2, h // 2)
|
||||
|
||||
M = cv2.getRotationMatrix2D(center, skew_angle, 1.0)
|
||||
|
||||
rotated = cv2.warpAffine(im, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
|
||||
return rotated
|
||||
|
||||
|
||||
def deskew_linebased(image: np.array, verbose=False) -> np.array:
|
||||
skew_angle = detect_angle_from_lines(image)
|
||||
if verbose:
|
||||
print(f"Skew angle from lines: {skew_angle}")
|
||||
if skew_angle:
|
||||
deskewed = rotate_straight(image, skew_angle)
|
||||
return deskewed
|
||||
return image
|
||||
def find_score(arr, angle):
|
||||
data = rotate(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
|
||||
hist = np.sum(data, axis=1)
|
||||
score = np.sum((hist[1:] - hist[:-1]) ** 2)
|
||||
return score
|
||||
|
||||
|
||||
def deskew_histbased(page: np.array, preprocess=True, max_abs_angle=1.5, delta=0.15, mode="nearest", verbose=False):
|
||||
if preprocess:
|
||||
if len(page.shape) > 2:
|
||||
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
|
||||
page = cv2.fastNlMeansDenoising(page, h=3)
|
||||
w, h = page.shape
|
||||
|
||||
def find_score(arr, angle):
|
||||
data = rotate(arr, angle, reshape=False, order=0)
|
||||
hist = np.sum(data, axis=1)
|
||||
score = np.sum((hist[1:] - hist[:-1]) ** 2)
|
||||
return score
|
||||
|
||||
angles = np.arange(-max_abs_angle, max_abs_angle + delta, delta)
|
||||
scores = []
|
||||
for angle in angles:
|
||||
scores.append(find_score(page, angle))
|
||||
|
||||
def find_best_angle(page):
|
||||
lim = CONFIG.deskew.max_abs_angle
|
||||
delta = CONFIG.deskew.delta
|
||||
angles = np.arange(-lim, lim + delta, delta)
|
||||
scores = [find_score(page, angle) for angle in angles]
|
||||
best_angle = angles[scores.index(max(scores))]
|
||||
if verbose:
|
||||
return best_angle
|
||||
|
||||
|
||||
def preprocess(arr: np.array):
|
||||
if len(arr.shape) > 2:
|
||||
arr = cv2.cvtColor(arr, cv2.COLOR_BGR2GRAY)
|
||||
arr = cv2.fastNlMeansDenoising(arr, h=CONFIG.deskew.filter_strength_h)
|
||||
return arr
|
||||
|
||||
|
||||
def deskew_histbased(page: np.array):
|
||||
page = preprocess(page)
|
||||
best_angle = find_best_angle(page)
|
||||
|
||||
if CONFIG.deskew.verbose:
|
||||
print("Skew angle from pixel histogram: {}".format(best_angle))
|
||||
|
||||
rotated = rotate(page, best_angle, reshape=False, order=0, mode=mode)
|
||||
rotated = rotate(page, best_angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
|
||||
return rotated, best_angle
|
||||
|
||||
@ -19,6 +19,6 @@ def detect_large_coherent_structures(image: np.array):
|
||||
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
|
||||
close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
|
||||
|
||||
cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
counts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
return cnts
|
||||
return counts
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
"""Defines the default logger for the service."""
|
||||
|
||||
|
||||
import sys
|
||||
import logging
|
||||
|
||||
from vidocp.config import CONFIG
|
||||
|
||||
|
||||
def get_logger():
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.getLevelName(CONFIG.service.logging_level))
|
||||
@ -19,4 +18,5 @@ def get_logger():
|
||||
logger.propagate = False
|
||||
return logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
@ -26,7 +26,7 @@ def remove_included(rectangles):
|
||||
|
||||
def is_not_included(rect, rectangles):
|
||||
return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
|
||||
|
||||
|
||||
rectangles = list(map(xywh_to_vec_rect, rectangles))
|
||||
rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
|
||||
rectangles = map(vec_rect_to_xywh, rectangles)
|
||||
|
||||
@ -3,6 +3,13 @@ import pdf2image
|
||||
import cv2
|
||||
|
||||
|
||||
def preprocess_pdf_image(page):
|
||||
if len(page.shape) > 2:
|
||||
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
|
||||
page = cv2.fastNlMeansDenoising(page, h=3)
|
||||
return page
|
||||
|
||||
|
||||
def open_pdf(pdf, first_page=0, last_page=None):
|
||||
first_page += 1
|
||||
last_page = None if last_page is None else last_page + 1
|
||||
@ -12,12 +19,5 @@ def open_pdf(pdf, first_page=0, last_page=None):
|
||||
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
|
||||
elif type(pdf) == list:
|
||||
return pdf
|
||||
pages = [array(p) for p in pages]
|
||||
pages = [preprocess_pdf_image(array(p)) for p in pages]
|
||||
return pages
|
||||
|
||||
|
||||
def preprocess_pdf_image(page):
|
||||
if len(page.shape) > 2:
|
||||
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
|
||||
page = cv2.fastNlMeansDenoising(page, h=3)
|
||||
return page
|
||||
@ -14,5 +14,6 @@ def copy_and_normalize_channels(image):
|
||||
|
||||
|
||||
def npconvert(ob):
|
||||
if isinstance(ob, generic): return ob.item()
|
||||
raise TypeError
|
||||
if isinstance(ob, generic):
|
||||
return ob.item()
|
||||
raise TypeError
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user