From 8b9621e798ed60d380eac5d54b8fe56b39324904 Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Tue, 8 Mar 2022 10:01:25 +0100 Subject: [PATCH] first fully working containerization; still needs environment variables; review request data format --- Dockerfile | 1 + Dockerfile_base | 7 ++-- requirements.txt | 2 +- scripts/client_mock.py | 35 +++++++++++++++++++ src/run_service.py | 60 +++++++++++++++++++++------------ vidocp/config.py | 5 ++- vidocp/figure_detection.py | 7 ++-- vidocp/layout_detection.py | 9 ++--- vidocp/layout_parsing.py | 22 +++++++----- vidocp/redaction_detection.py | 12 +++++-- vidocp/table_parsig.py | 16 +++++---- vidocp/table_parsing.py | 27 +++++++++------ vidocp/table_parsing_2.py | 5 +-- vidocp/utils/deskew.py | 10 +++--- vidocp/utils/detection.py | 7 ++-- vidocp/utils/post_processing.py | 2 +- vidocp/utils/preprocessing.py | 23 +++++++++++++ vidocp/utils/text.py | 7 ++-- vidocp/utils/utils.py | 6 ++++ 19 files changed, 191 insertions(+), 72 deletions(-) create mode 100644 vidocp/utils/preprocessing.py diff --git a/Dockerfile b/Dockerfile index 076315f..98abde6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,6 +8,7 @@ WORKDIR /app/service COPY ./src ./src COPY vidocp ./vidocp +RUN python3 -m pip install --upgrade pip RUN python3 -m pip install -e . WORKDIR /app/service diff --git a/Dockerfile_base b/Dockerfile_base index 18a255b..b564b73 100644 --- a/Dockerfile_base +++ b/Dockerfile_base @@ -12,7 +12,6 @@ WORKDIR /app/service COPY . ./ # Install dependencies. -RUN apt-get update && apt-get install -y python3-opencv RUN python3 -m pip install -r requirements.txt # Make a new container and copy all relevant files over to filter out temporary files @@ -23,4 +22,8 @@ WORKDIR /app/ COPY --from=builder1 /app . ENV PATH="/app/venv/bin:$PATH" -WORKDIR /app/service \ No newline at end of file +WORKDIR /app/service + +RUN apt update +#RUN apt install python3-opencv-headless +RUN apt install poppler-utils --yes \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index bf6d0be..2d95184 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -opencv-python~=4.5.5.62 +opencv-python-headless~=4.5.5.62 numpy~=1.22.1 pdf2image~=1.16.0 matplotlib~=3.5.1 diff --git a/scripts/client_mock.py b/scripts/client_mock.py index e69de29..d1e258d 100644 --- a/scripts/client_mock.py +++ b/scripts/client_mock.py @@ -0,0 +1,35 @@ +import argparse +import json +import requests + +from vidocp.utils.preprocessing import open_pdf + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--pdf_path", required=True, help="path to PDF file") + parser.add_argument("--first_page", type=int, required=True, help="path to PDF file") + parser.add_argument("--last_page", type=int, required=False, default=None, help="path to PDF file") + args = parser.parse_args() + + return args + + +def main(args): + + #data = open_pdf(args.pdf_path, args.first_page, args.last_page) + # params = json.dumps({ + # "pdf_path": "a",#args.pdf_path, + # "first_page": 4,#args.first_page, + # "last_page": 6#args.last_page + # }) + response = requests.post("http://127.0.0.1:5000", data=open(args.pdf_path, "rb"))#, json=params) + response.raise_for_status() + predictions = response.json() + + print(json.dumps(predictions, indent=2)) + + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/src/run_service.py b/src/run_service.py index 5d0d9be..0cf606f 100644 --- a/src/run_service.py +++ b/src/run_service.py @@ -4,12 +4,14 @@ import logging from flask import Flask, request, jsonify from waitress import serve -from vidocp.utils import preprocess #TODO +from vidocp.utils import npconvert +from vidocp.utils.preprocessing import preprocess_pdf_image #TODO from vidocp.table_parsing import parse_table#, detect_tables_in_pdf from vidocp.redaction_detection import find_redactions#, detect_redactions_in_pdf from vidocp.layout_parsing import parse_layout#, detect_layout_in_pdf #TODO from vidocp.figure_detection import detect_figures#, detect_figures_in_pdf #TODO from vidocp.utils.logging import logger +from vidocp.utils.preprocessing import open_pdf from vidocp.config import CONFIG @@ -18,18 +20,18 @@ def suppress_user_warnings(): warnings.filterwarnings("ignore") -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--warnings", action="store_true", default=False) - args = parser.parse_args() +# def parse_args(): +# parser = argparse.ArgumentParser() +# parser.add_argument("--warnings", action="store_true", default=False) +# args = parser.parse_args() - return args +# return args -def main(args): +def main(): - if not args.warnings: - suppress_user_warnings() + #if not args.warnings: + # suppress_user_warnings() run_server() @@ -41,9 +43,16 @@ def run_server(): def predict_request(): def inner(): data = request.data + #print(type(request)) + #print(dir(request)) + params = request.json + #print("params:", params) logger.info(f"<3 Received data.") + print("data type:", type(data)) + #print("json type:", type(params)) logger.info(f"Processing data. <3") - predictions = make_predictions(data) + pdf_data = open_pdf(data) + predictions = make_predictions(pdf_data) return jsonify({"result": predictions}) try: return inner() @@ -60,22 +69,31 @@ def run_server(): return jsonify(response) #predictor = initialize_predictor() - #logger.info("<3 Predictor ready.") + logger.info("<3 Annotator ready.") mode = CONFIG.webserver.mode if mode == "development": app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True) elif mode == "production": serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port) + logging.info("Production.") -def make_predictions(pdf_data, page_index): - pdf = preprocess(pdf_data[page_index]) - tables = parse_table(pdf) - redactions = find_redactions(pdf) - layout = parse_layout(pdf) - figure = detect_figures(pdf) - return jsonify({"tables": tables, - "redactions": redactions, - "layout": layout, - "figure": figure}) +def make_predictions(pdf_data): + output = {} + pdf = open_pdf(pdf_data) + for i, page in enumerate(pdf): + page = preprocess_pdf_image(page) + tables = json.dumps(list(parse_table(page)), default=npconvert) #list() for consistency; not strictly necessary + redactions = json.dumps(list(find_redactions(page)), default=npconvert) + layout = json.dumps(list(parse_layout(page)), default=npconvert) + figure = json.dumps(list(detect_figures(page)), default=npconvert) + output.update({i: {"tables": tables, + "redactions": redactions, + "layout": layout, + "figure": figure}}) + return output + + +if __name__ == "__main__": + main() diff --git a/vidocp/config.py b/vidocp/config.py index eaf36ce..b2b7b84 100644 --- a/vidocp/config.py +++ b/vidocp/config.py @@ -33,4 +33,7 @@ class Config: return _get_item_and_maybe_make_dotindexable(self.__config, item) def __getitem__(self, item): - return self.__getattr__(item) \ No newline at end of file + return self.__getattr__(item) + + +CONFIG = Config(CONFIG_FILE) \ No newline at end of file diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 27a8eb2..313ddef 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -28,7 +28,7 @@ def detect_figures(image: np.array): return rects -def detect_figures_in_pdf(pdf_path, page_index=1): +def detect_figures_in_pdf(pdf_path, page_index=1, show=True): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) @@ -36,4 +36,7 @@ def detect_figures_in_pdf(pdf_path, page_index=1): redaction_contours = detect_figures(page) page = draw_rectangles(page, redaction_contours) - show_mpl(page) + if show: + show_mpl(page) + else: + return page diff --git a/vidocp/layout_detection.py b/vidocp/layout_detection.py index 1d49684..2014f90 100644 --- a/vidocp/layout_detection.py +++ b/vidocp/layout_detection.py @@ -7,10 +7,11 @@ from matplotlib import pyplot as plt def find_layout_boxes(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) - thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] - img_bin = ~thresh + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + image = cv2.GaussianBlur(image, (5, 5), 1) + image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY)[1] + img_bin = ~image line_min_width = 10 kernel_h = np.ones((10, line_min_width), np.uint8) diff --git a/vidocp/layout_parsing.py b/vidocp/layout_parsing.py index b5f1c51..b0691c2 100644 --- a/vidocp/layout_parsing.py +++ b/vidocp/layout_parsing.py @@ -31,10 +31,12 @@ def find_segments(image): def parse_layout(image: np.array): image = image.copy() - - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - blur = cv2.GaussianBlur(gray, (7, 7), 0) - thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + image_ = image.copy() + + if len(image_.shape) > 2: + image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2GRAY) + image_ = cv2.GaussianBlur(image_, (7, 7), 0) + thresh = cv2.threshold(image_, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) dilate = cv2.dilate(thresh, kernel, iterations=4) @@ -50,7 +52,8 @@ def parse_layout(image: np.array): _, image = cv2.threshold(image, 254, 255, cv2.THRESH_BINARY) image = ~image - image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) rects = find_segments(image) # <- End of meta detection @@ -60,12 +63,15 @@ def parse_layout(image: np.array): return rects -def annotate_layout_in_pdf(pdf_path, page_index=1): +def annotate_layout_in_pdf(pdf_path, page_index=1, show=False): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) rects = parse_layout(page) page = draw_rectangles(page, rects) - - show_mpl(page) + + if show: + show_mpl(page) + else: + return page diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index 3362dc6..588be2b 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -18,7 +18,10 @@ def find_redactions(image: np.array, min_normalized_area=200000): min_normalized_area /= 200 # Assumes 200 DPI PDF -> image conversion resolution - gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + if len(image.shape) > 2: + gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + gray = ~image blurred = cv2.GaussianBlur(gray, (5, 5), 1) thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1] @@ -30,7 +33,7 @@ def find_redactions(image: np.array, min_normalized_area=200000): return contours -def annotate_redactions_in_pdf(pdf_path, page_index=1): +def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) @@ -38,4 +41,7 @@ def annotate_redactions_in_pdf(pdf_path, page_index=1): redaction_contours = find_redactions(page) page = draw_contours(page, redaction_contours) - show_mpl(page) + if show: + show_mpl(page) + else: + return page diff --git a/vidocp/table_parsig.py b/vidocp/table_parsig.py index 099830e..2fe7c35 100644 --- a/vidocp/table_parsig.py +++ b/vidocp/table_parsig.py @@ -8,12 +8,13 @@ from matplotlib import pyplot as plt def parse(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) #plt.imshow(gray_scale) - blurred = cv2.GaussianBlur(gray_scale, (7, 7), 2) #5 5 1 - thresh = cv2.threshold(blurred, 251, 255, cv2.THRESH_BINARY)[1] + image = cv2.GaussianBlur(image, (7, 7), 2) #5 5 1 + image = cv2.threshold(image, 251, 255, cv2.THRESH_BINARY)[1] #plt.imshow(thresh) - img_bin = ~thresh + img_bin = ~image line_min_width = 7 kernel_h = np.ones((10, line_min_width), np.uint8) @@ -37,9 +38,10 @@ def parse_tables(image: np.array, rects: list): for rect in rects: (x,y,w,h) = rect region_of_interest = image[x:x+w, y:y+h] - gray = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY) - thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)[1] - img_bin = ~thresh + if len(region_of_interest.shape) > 2: + region_of_interest = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY) + region_of_interest = cv2.threshold(region_of_interest, 200, 255, cv2.THRESH_BINARY)[1] + img_bin = ~region_of_interest line_min_width = 5 kernel_h = np.ones((1, line_min_width), np.uint8) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index f6801ca..97df384 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -26,14 +26,15 @@ def add_external_contours(image, img): -def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): +def isolate_vertical_and_horizontal_components(img_bin, bounding_rects, show=False): line_min_width = 48 kernel_h = np.ones((1, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 1), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - show_mpl(img_bin_h | img_bin_v) + if show: + show_mpl(img_bin_h | img_bin_v) kernel_h = np.ones((1, 30), np.uint8) kernel_v = np.ones((30, 1), np.uint8) @@ -46,7 +47,8 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): img_bin_v = apply_motion_blur(img_bin_v, 80, 90) img_bin_final = img_bin_h | img_bin_v - show_mpl(img_bin_final) + if show: + show_mpl(img_bin_final) # changed threshold from 110 to 120 to minimize cell splitting th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY) img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1) @@ -118,15 +120,17 @@ def find_table_layout_boxes(image: np.array): return table_boxes -def parse_table(image: np.array): +def parse_table(image: np.array, show=False): def is_large_enough(stat): x1, y1, w, h, area = stat return area > 2000 and w > 35 and h > 25 - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image - th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image + th1, img_bin = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin - show_mpl(img_bin) + if show: + show_mpl(img_bin) table_layout_boxes = find_table_layout_boxes(image) img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes) @@ -143,7 +147,7 @@ def parse_table(image: np.array): return rects -def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False): +def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) if deskew: @@ -153,5 +157,8 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False): page = draw_rectangles(page, stats, annotate=True) # if stats: # page = draw_rectangles(page, stats, annotate=True) - - show_mpl(page) + + if show: + show_mpl(page) + else: + return page diff --git a/vidocp/table_parsing_2.py b/vidocp/table_parsing_2.py index 8b035bf..d8f58c8 100644 --- a/vidocp/table_parsing_2.py +++ b/vidocp/table_parsing_2.py @@ -48,8 +48,9 @@ def annotate_image(image, stats): def parse_table(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + th1, img_bin = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY) img_bin = ~img_bin img_bin = isolate_vertical_and_horizontal_components(img_bin) diff --git a/vidocp/utils/deskew.py b/vidocp/utils/deskew.py index 4770bd9..727ccec 100644 --- a/vidocp/utils/deskew.py +++ b/vidocp/utils/deskew.py @@ -8,9 +8,10 @@ def detect_angle_from_lines(im: np.array, max_skew_deg=10, min_skew_deg=0.1, min min_skew_rad = np.deg2rad(min_skew_deg) width = im.shape[1] - im_gs = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) - im_gs = cv2.fastNlMeansDenoising(im_gs, h=3) - im_bw = cv2.threshold(im_gs, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1] + if len(im.shape) > 2: + im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) + im = cv2.fastNlMeansDenoising(im, h=3) + im_bw = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1] lines = cv2.HoughLinesP(im_bw, 1, np.pi / 180, 200, minLineLength=width / 12, maxLineGap=width / 150) @@ -54,7 +55,8 @@ def deskew_linebased(image: np.array, verbose=False) -> np.array: def deskew_histbased(page: np.array, preprocess=True, max_abs_angle=1.5, delta=0.15, mode="nearest", verbose=False): if preprocess: - page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) + if len(page.shape) > 2: + page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) page = cv2.fastNlMeansDenoising(page, h=3) w, h = page.shape diff --git a/vidocp/utils/detection.py b/vidocp/utils/detection.py index e5d8266..2df75a2 100644 --- a/vidocp/utils/detection.py +++ b/vidocp/utils/detection.py @@ -8,12 +8,13 @@ def detect_large_coherent_structures(image: np.array): References: https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection """ - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] + image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY)[1] dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) - dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) + dilate = cv2.dilate(~image, dilate_kernel, iterations=4) close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 86bd37f..a85b4dc 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -130,7 +130,7 @@ def xywh_to_vecs(rect): x1, y1, w, h = rect x2 = x1 + w y2 = y1 + h - return Rectangle(x1, y1, x2, y2) + return (x1, y1), (x2, y2) def vec_rect_to_xywh(rect): diff --git a/vidocp/utils/preprocessing.py b/vidocp/utils/preprocessing.py new file mode 100644 index 0000000..2aee52d --- /dev/null +++ b/vidocp/utils/preprocessing.py @@ -0,0 +1,23 @@ +from numpy import array +import pdf2image +import cv2 + + +def open_pdf(pdf, first_page=0, last_page=None): + first_page += 1 + last_page = None if last_page is None else last_page + 1 + if type(pdf) == str: + pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page) + elif type(pdf) == bytes: + pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page) + elif type(pdf) == list: + return pdf + pages = [array(p) for p in pages] + return pages + + +def preprocess_pdf_image(page): + if len(page.shape) > 2: + page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) + page = cv2.fastNlMeansDenoising(page, h=3) + return page \ No newline at end of file diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py index 4189005..acfaa48 100644 --- a/vidocp/utils/text.py +++ b/vidocp/utils/text.py @@ -40,12 +40,13 @@ def find_primary_text_regions(image): image = image.copy() - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + if len(image.shape) > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) - close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) + close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1) dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) dilate = cv2.dilate(close, dilate_kernel, iterations=1) diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py index 18c8eb2..72dd99f 100644 --- a/vidocp/utils/utils.py +++ b/vidocp/utils/utils.py @@ -1,3 +1,4 @@ +from numpy import generic import cv2 @@ -10,3 +11,8 @@ def copy_and_normalize_channels(image): pass return image + + +def npconvert(ob): + if isinstance(ob, generic): return ob.item() + raise TypeError \ No newline at end of file