first fully working containerization; still needs environment variables; review request data format

This commit is contained in:
Isaac Riley 2022-03-08 10:01:25 +01:00
parent 7784993d1f
commit 8b9621e798
19 changed files with 191 additions and 72 deletions

View File

@ -8,6 +8,7 @@ WORKDIR /app/service
COPY ./src ./src
COPY vidocp ./vidocp
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install -e .
WORKDIR /app/service

View File

@ -12,7 +12,6 @@ WORKDIR /app/service
COPY . ./
# Install dependencies.
RUN apt-get update && apt-get install -y python3-opencv
RUN python3 -m pip install -r requirements.txt
# Make a new container and copy all relevant files over to filter out temporary files
@ -23,4 +22,8 @@ WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"
WORKDIR /app/service
WORKDIR /app/service
RUN apt update
#RUN apt install python3-opencv-headless
RUN apt install poppler-utils --yes

View File

@ -1,4 +1,4 @@
opencv-python~=4.5.5.62
opencv-python-headless~=4.5.5.62
numpy~=1.22.1
pdf2image~=1.16.0
matplotlib~=3.5.1

View File

@ -0,0 +1,35 @@
import argparse
import json
import requests
from vidocp.utils.preprocessing import open_pdf
def parse_args(argv=None):
    """Parse the command-line options of the prediction client.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process arguments (sys.argv[1:]).

    Returns:
        argparse.Namespace with ``pdf_path`` (str), ``first_page`` (int) and
        ``last_page`` (int or None).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf_path", required=True, help="path to PDF file")
    parser.add_argument(
        "--first_page",
        type=int,
        required=True,
        help="first page to process",
    )
    parser.add_argument(
        "--last_page",
        type=int,
        required=False,
        default=None,
        help="last page to process (optional)",
    )
    return parser.parse_args(argv)
def main(args):
    """Upload a PDF to the local prediction service and print its JSON reply.

    Args:
        args: Namespace providing ``pdf_path``, the path of the PDF to send.

    Raises:
        OSError: if the PDF file cannot be opened.
        requests.HTTPError: if the service answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    # TODO: args.first_page / args.last_page are parsed but not transmitted;
    # the request format for the page range is still to be decided.
    # The context manager guarantees the file handle is closed once the
    # upload has finished, even when the request raises.
    with open(args.pdf_path, "rb") as pdf_file:
        response = requests.post("http://127.0.0.1:5000", data=pdf_file)
    response.raise_for_status()
    predictions = response.json()
    print(json.dumps(predictions, indent=2))
if __name__ == "__main__":
    # Script entry point: parse the command line, then run the client.
    main(parse_args())

View File

@ -4,12 +4,14 @@ import logging
from flask import Flask, request, jsonify
from waitress import serve
from vidocp.utils import preprocess #TODO
from vidocp.utils import npconvert
from vidocp.utils.preprocessing import preprocess_pdf_image #TODO
from vidocp.table_parsing import parse_table#, detect_tables_in_pdf
from vidocp.redaction_detection import find_redactions#, detect_redactions_in_pdf
from vidocp.layout_parsing import parse_layout#, detect_layout_in_pdf #TODO
from vidocp.figure_detection import detect_figures#, detect_figures_in_pdf #TODO
from vidocp.utils.logging import logger
from vidocp.utils.preprocessing import open_pdf
from vidocp.config import CONFIG
@ -18,18 +20,18 @@ def suppress_user_warnings():
warnings.filterwarnings("ignore")
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--warnings", action="store_true", default=False)
args = parser.parse_args()
# def parse_args():
# parser = argparse.ArgumentParser()
# parser.add_argument("--warnings", action="store_true", default=False)
# args = parser.parse_args()
return args
# return args
def main(args):
def main():
if not args.warnings:
suppress_user_warnings()
#if not args.warnings:
# suppress_user_warnings()
run_server()
@ -41,9 +43,16 @@ def run_server():
def predict_request():
def inner():
data = request.data
#print(type(request))
#print(dir(request))
params = request.json
#print("params:", params)
logger.info(f"<3 Received data.")
print("data type:", type(data))
#print("json type:", type(params))
logger.info(f"Processing data. <3")
predictions = make_predictions(data)
pdf_data = open_pdf(data)
predictions = make_predictions(pdf_data)
return jsonify({"result": predictions})
try:
return inner()
@ -60,22 +69,31 @@ def run_server():
return jsonify(response)
#predictor = initialize_predictor()
#logger.info("<3 Predictor ready.")
logger.info("<3 Annotator ready.")
mode = CONFIG.webserver.mode
if mode == "development":
app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True)
elif mode == "production":
serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
logging.info("Production.")
def make_predictions(pdf_data, page_index):
pdf = preprocess(pdf_data[page_index])
tables = parse_table(pdf)
redactions = find_redactions(pdf)
layout = parse_layout(pdf)
figure = detect_figures(pdf)
return jsonify({"tables": tables,
"redactions": redactions,
"layout": layout,
"figure": figure})
def make_predictions(pdf_data):
    """Run every detector on each page and collect the JSON-encoded results.

    Args:
        pdf_data: Input handed to ``open_pdf`` to obtain the pages.

    Returns:
        dict mapping the page index to a dict with the keys ``tables``,
        ``redactions``, ``layout`` and ``figure``; each value is a JSON
        string produced by ``json.dumps`` with ``npconvert`` as fallback.
    """

    def encode(detections):
        # list() keeps the encoded shape consistent across detectors.
        return json.dumps(list(detections), default=npconvert)

    # Detectors run in this order for every page.
    detectors = (
        ("tables", parse_table),
        ("redactions", find_redactions),
        ("layout", parse_layout),
        ("figure", detect_figures),
    )

    output = {}
    for i, raw_page in enumerate(open_pdf(pdf_data)):
        page = preprocess_pdf_image(raw_page)
        output[i] = {key: encode(detect(page)) for key, detect in detectors}
    return output
if __name__ == "__main__":
main()

View File

@ -33,4 +33,7 @@ class Config:
return _get_item_and_maybe_make_dotindexable(self.__config, item)
def __getitem__(self, item):
return self.__getattr__(item)
return self.__getattr__(item)
CONFIG = Config(CONFIG_FILE)

View File

@ -28,7 +28,7 @@ def detect_figures(image: np.array):
return rects
def detect_figures_in_pdf(pdf_path, page_index=1):
def detect_figures_in_pdf(pdf_path, page_index=1, show=True):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
@ -36,4 +36,7 @@ def detect_figures_in_pdf(pdf_path, page_index=1):
redaction_contours = detect_figures(page)
page = draw_rectangles(page, redaction_contours)
show_mpl(page)
if show:
show_mpl(page)
else:
return page

View File

@ -7,10 +7,11 @@ from matplotlib import pyplot as plt
def find_layout_boxes(image: np.array):
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1)
thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1]
img_bin = ~thresh
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
image = cv2.GaussianBlur(image, (5, 5), 1)
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY)[1]
img_bin = ~image
line_min_width = 10
kernel_h = np.ones((10, line_min_width), np.uint8)

View File

@ -31,10 +31,12 @@ def find_segments(image):
def parse_layout(image: np.array):
image = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (7, 7), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
image_ = image.copy()
if len(image_.shape) > 2:
image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2GRAY)
image_ = cv2.GaussianBlur(image_, (7, 7), 0)
thresh = cv2.threshold(image_, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
dilate = cv2.dilate(thresh, kernel, iterations=4)
@ -50,7 +52,8 @@ def parse_layout(image: np.array):
_, image = cv2.threshold(image, 254, 255, cv2.THRESH_BINARY)
image = ~image
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
rects = find_segments(image)
# <- End of meta detection
@ -60,12 +63,15 @@ def parse_layout(image: np.array):
return rects
def annotate_layout_in_pdf(pdf_path, page_index=1):
def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
rects = parse_layout(page)
page = draw_rectangles(page, rects)
show_mpl(page)
if show:
show_mpl(page)
else:
return page

View File

@ -18,7 +18,10 @@ def find_redactions(image: np.array, min_normalized_area=200000):
min_normalized_area /= 200 # Assumes 200 DPI PDF -> image conversion resolution
gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
if len(image.shape) > 2:
gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = ~image
blurred = cv2.GaussianBlur(gray, (5, 5), 1)
thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1]
@ -30,7 +33,7 @@ def find_redactions(image: np.array, min_normalized_area=200000):
return contours
def annotate_redactions_in_pdf(pdf_path, page_index=1):
def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
@ -38,4 +41,7 @@ def annotate_redactions_in_pdf(pdf_path, page_index=1):
redaction_contours = find_redactions(page)
page = draw_contours(page, redaction_contours)
show_mpl(page)
if show:
show_mpl(page)
else:
return page

View File

@ -8,12 +8,13 @@ from matplotlib import pyplot as plt
def parse(image: np.array):
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#plt.imshow(gray_scale)
blurred = cv2.GaussianBlur(gray_scale, (7, 7), 2) #5 5 1
thresh = cv2.threshold(blurred, 251, 255, cv2.THRESH_BINARY)[1]
image = cv2.GaussianBlur(image, (7, 7), 2) #5 5 1
image = cv2.threshold(image, 251, 255, cv2.THRESH_BINARY)[1]
#plt.imshow(thresh)
img_bin = ~thresh
img_bin = ~image
line_min_width = 7
kernel_h = np.ones((10, line_min_width), np.uint8)
@ -37,9 +38,10 @@ def parse_tables(image: np.array, rects: list):
for rect in rects:
(x,y,w,h) = rect
region_of_interest = image[x:x+w, y:y+h]
gray = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)[1]
img_bin = ~thresh
if len(region_of_interest.shape) > 2:
region_of_interest = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
region_of_interest = cv2.threshold(region_of_interest, 200, 255, cv2.THRESH_BINARY)[1]
img_bin = ~region_of_interest
line_min_width = 5
kernel_h = np.ones((1, line_min_width), np.uint8)

View File

@ -26,14 +26,15 @@ def add_external_contours(image, img):
def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
def isolate_vertical_and_horizontal_components(img_bin, bounding_rects, show=False):
line_min_width = 48
kernel_h = np.ones((1, line_min_width), np.uint8)
kernel_v = np.ones((line_min_width, 1), np.uint8)
img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
show_mpl(img_bin_h | img_bin_v)
if show:
show_mpl(img_bin_h | img_bin_v)
kernel_h = np.ones((1, 30), np.uint8)
kernel_v = np.ones((30, 1), np.uint8)
@ -46,7 +47,8 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
img_bin_v = apply_motion_blur(img_bin_v, 80, 90)
img_bin_final = img_bin_h | img_bin_v
show_mpl(img_bin_final)
if show:
show_mpl(img_bin_final)
# changed threshold from 110 to 120 to minimize cell splitting
th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
@ -118,15 +120,17 @@ def find_table_layout_boxes(image: np.array):
return table_boxes
def parse_table(image: np.array):
def parse_table(image: np.array, show=False):
def is_large_enough(stat):
x1, y1, w, h, area = stat
return area > 2000 and w > 35 and h > 25
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
th1, img_bin = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
img_bin = ~img_bin
show_mpl(img_bin)
if show:
show_mpl(img_bin)
table_layout_boxes = find_table_layout_boxes(image)
img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes)
@ -143,7 +147,7 @@ def parse_table(image: np.array):
return rects
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False):
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
if deskew:
@ -153,5 +157,8 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False):
page = draw_rectangles(page, stats, annotate=True)
# if stats:
# page = draw_rectangles(page, stats, annotate=True)
show_mpl(page)
if show:
show_mpl(page)
else:
return page

View File

@ -48,8 +48,9 @@ def annotate_image(image, stats):
def parse_table(image: np.array):
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
th1, img_bin = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
img_bin = ~img_bin
img_bin = isolate_vertical_and_horizontal_components(img_bin)

View File

@ -8,9 +8,10 @@ def detect_angle_from_lines(im: np.array, max_skew_deg=10, min_skew_deg=0.1, min
min_skew_rad = np.deg2rad(min_skew_deg)
width = im.shape[1]
im_gs = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
im_gs = cv2.fastNlMeansDenoising(im_gs, h=3)
im_bw = cv2.threshold(im_gs, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
if len(im.shape) > 2:
im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
im = cv2.fastNlMeansDenoising(im, h=3)
im_bw = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
lines = cv2.HoughLinesP(im_bw, 1, np.pi / 180, 200, minLineLength=width / 12, maxLineGap=width / 150)
@ -54,7 +55,8 @@ def deskew_linebased(image: np.array, verbose=False) -> np.array:
def deskew_histbased(page: np.array, preprocess=True, max_abs_angle=1.5, delta=0.15, mode="nearest", verbose=False):
if preprocess:
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
if len(page.shape) > 2:
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
page = cv2.fastNlMeansDenoising(page, h=3)
w, h = page.shape

View File

@ -8,12 +8,13 @@ def detect_large_coherent_structures(image: np.array):
References:
https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
"""
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY)[1]
dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5))
dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)
dilate = cv2.dilate(~image, dilate_kernel, iterations=4)
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)

View File

@ -130,7 +130,7 @@ def xywh_to_vecs(rect):
x1, y1, w, h = rect
x2 = x1 + w
y2 = y1 + h
return Rectangle(x1, y1, x2, y2)
return (x1, y1), (x2, y2)
def vec_rect_to_xywh(rect):

View File

@ -0,0 +1,23 @@
from numpy import array
import pdf2image
import cv2
def open_pdf(pdf, first_page=0, last_page=None):
    """Load the pages of a PDF as numpy arrays.

    Args:
        pdf: Path to a PDF file (str), raw PDF content (bytes), or a list.
            A list is returned unchanged, i.e. treated as already loaded.
        first_page: Index of the first page to load; it is incremented by
            one before being passed to pdf2image.
        last_page: Index of the last page to load, or None for no upper
            limit; it is likewise incremented by one.

    Returns:
        A list with one numpy array per converted page, or the input list
        itself when ``pdf`` is a list.

    Raises:
        TypeError: if ``pdf`` is neither str, bytes nor list.
    """
    if isinstance(pdf, list):
        return pdf

    # Shift the caller's indices by one to match pdf2image's page numbering.
    first_page += 1
    last_page = None if last_page is None else last_page + 1

    if isinstance(pdf, str):
        pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
    elif isinstance(pdf, bytes):
        pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
    else:
        # Previously this fell through to an UnboundLocalError on `pages`.
        raise TypeError(
            f"open_pdf expects a str path, bytes or a list of pages, got {type(pdf).__name__}"
        )
    return [array(p) for p in pages]
def preprocess_pdf_image(page):
    """Reduce a page image to a single channel and denoise it.

    Args:
        page: Image array; anything with more than two dimensions is
            converted to grayscale first.

    Returns:
        The denoised single-channel image.
    """
    # NOTE(review): pages built by open_pdf come from pdf2image; confirm the
    # channel order really is BGR as this conversion code assumes.
    has_channels = len(page.shape) > 2
    grayscale = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) if has_channels else page
    return cv2.fastNlMeansDenoising(grayscale, h=3)

View File

@ -40,12 +40,13 @@ def find_primary_text_regions(image):
image = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
if len(image.shape) > 2:
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
dilate = cv2.dilate(close, dilate_kernel, iterations=1)

View File

@ -1,3 +1,4 @@
from numpy import generic
import cv2
@ -10,3 +11,8 @@ def copy_and_normalize_channels(image):
pass
return image
def npconvert(ob):
    """Fallback serializer for ``json.dumps(default=...)`` that unwraps numpy values.

    Args:
        ob: Object the JSON encoder could not serialize on its own.

    Returns:
        The equivalent Python scalar for a numpy scalar, or a (nested) list
        for a numpy array.

    Raises:
        TypeError: for any other type, as the ``default`` hook of
            ``json.dumps`` requires for unsupported objects.
    """
    # Local import: the module itself only imports `generic` from numpy.
    from numpy import ndarray

    if isinstance(ob, generic):
        return ob.item()
    if isinstance(ob, ndarray):
        # Arrays (e.g. contour point sets) become plain nested lists.
        return ob.tolist()
    raise TypeError(f"Object of type {type(ob).__name__} is not JSON serializable")