First fully working containerization; environment variables still need to be set up; request data format still needs review

2022-03-08 10:01:25 +01:00 · 2022-03-08 10:01:25 +01:00 · 8b9621e798
commit 8b9621e798
parent 7784993d1f
19 changed files with 191 additions and 72 deletions
--- a/1
+++ b/1
@ -8,6 +8,7 @@ WORKDIR /app/service
 COPY ./src ./src
 COPY vidocp ./vidocp

+RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install -e .

 WORKDIR /app/service
--- a/7
+++ b/7
@ -12,7 +12,6 @@ WORKDIR /app/service
 COPY . ./

 # Install dependencies.
-RUN apt-get update && apt-get install -y python3-opencv
 RUN python3 -m pip install -r requirements.txt

 # Make a new container and copy all relevant files over to filter out temporary files
@ -23,4 +22,8 @@ WORKDIR /app/
 COPY --from=builder1  /app .
 ENV PATH="/app/venv/bin:$PATH"

-WORKDIR /app/service
+WORKDIR /app/service
+
+# Update and install in a single layer so a cached package index is never reused by a later install.
+RUN apt-get update && apt-get install --yes poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
-opencv-python~=4.5.5.62
+opencv-python-headless~=4.5.5.62
 numpy~=1.22.1
 pdf2image~=1.16.0
 matplotlib~=3.5.1
--- a/scripts/client_mock.py
+++ b/scripts/client_mock.py
@ -0,0 +1,35 @@
+import argparse
+import json
+import requests
+
+from vidocp.utils.preprocessing import open_pdf
+
+
+def parse_args():
+    """Parse the mock client's command line: the PDF path plus a page range."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pdf_path", required=True, help="path to PDF file")
+    parser.add_argument("--first_page", type=int, required=True, help="index of the first page to process")
+    parser.add_argument("--last_page", type=int, required=False, default=None, help="index of the last page to process")
+
+    return parser.parse_args()
+
+
+def main(args):
+    """Send the raw bytes of the PDF to the local service and pretty-print its JSON reply.
+
+    Only ``args.pdf_path`` is used; ``args.first_page`` / ``args.last_page`` are not
+    sent to the service yet.
+    """
+    # Context manager so the file handle is closed once the upload has finished.
+    with open(args.pdf_path, "rb") as pdf_file:
+        response = requests.post("http://127.0.0.1:5000", data=pdf_file)
+    response.raise_for_status()
+    predictions = response.json()
+
+    print(json.dumps(predictions, indent=2))
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/src/run_service.py
+++ b/src/run_service.py
@ -4,12 +4,14 @@ import logging
 from flask import Flask, request, jsonify
 from waitress import serve

-from vidocp.utils import preprocess #TODO
+from vidocp.utils import npconvert
+from vidocp.utils.preprocessing import preprocess_pdf_image #TODO
 from vidocp.table_parsing import parse_table#, detect_tables_in_pdf
 from vidocp.redaction_detection import find_redactions#, detect_redactions_in_pdf
 from vidocp.layout_parsing import parse_layout#, detect_layout_in_pdf #TODO
 from vidocp.figure_detection import detect_figures#, detect_figures_in_pdf #TODO
 from vidocp.utils.logging import logger
+from vidocp.utils.preprocessing import open_pdf
 from vidocp.config import CONFIG


@ -18,18 +20,18 @@ def suppress_user_warnings():
    warnings.filterwarnings("ignore")


-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--warnings", action="store_true", default=False)
-    args = parser.parse_args()
+# def parse_args():
+#     parser = argparse.ArgumentParser()
+#     parser.add_argument("--warnings", action="store_true", default=False)
+#     args = parser.parse_args()

-    return args
+#     return args


-def main(args):
+def main():

-    if not args.warnings:
-        suppress_user_warnings()
+    #if not args.warnings:
+    #    suppress_user_warnings()

    run_server()

@ -41,9 +43,16 @@ def run_server():
    def predict_request():
        def inner():
            data = request.data
+            #print(type(request))
+            #print(dir(request))
+            params = request.json
+            #print("params:", params)
            logger.info(f"<3 Received data.")
+            print("data type:", type(data))
+            #print("json type:", type(params))
            logger.info(f"Processing data. <3")
-            predictions = make_predictions(data)
+            pdf_data = open_pdf(data)
+            predictions = make_predictions(pdf_data)
            return jsonify({"result": predictions})
        try:
            return inner()
@ -60,22 +69,31 @@ def run_server():
        return jsonify(response)

    #predictor = initialize_predictor()
-    #logger.info("<3 Predictor ready.")
+    logger.info("<3 Annotator ready.")

    mode = CONFIG.webserver.mode
    if mode == "development":
        app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True)
    elif mode == "production":
        serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
+        logging.info("Production.")


-def make_predictions(pdf_data, page_index):
-    pdf = preprocess(pdf_data[page_index])
-    tables = parse_table(pdf)
-    redactions = find_redactions(pdf)
-    layout = parse_layout(pdf)
-    figure = detect_figures(pdf)
-    return jsonify({"tables": tables,
-                    "redactions": redactions,
-                    "layout": layout,
-                    "figure": figure})
+def make_predictions(pdf_data):
+    """Run all detectors on every page; returns {page_index: {detector: JSON string}}."""
+    detectors = {"tables": parse_table,
+                 "redactions": find_redactions,
+                 "layout": parse_layout,
+                 "figure": detect_figures}
+    output = {}
+    # open_pdf returns a list of pages unchanged, so pre-rendered input is fine.
+    for i, page in enumerate(open_pdf(pdf_data)):
+        page = preprocess_pdf_image(page)
+        # list() for consistency; not strictly necessary
+        output[i] = {name: json.dumps(list(detect(page)), default=npconvert)
+                     for name, detect in detectors.items()}
+    return output
+
+
+if __name__ == "__main__":
+    main()
--- a/vidocp/config.py
+++ b/vidocp/config.py
@ -33,4 +33,7 @@ class Config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)

    def __getitem__(self, item):
-        return self.__getattr__(item)
+        return self.__getattr__(item)
+
+
+CONFIG = Config(CONFIG_FILE)
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@ -28,7 +28,7 @@ def detect_figures(image: np.array):
    return rects


-def detect_figures_in_pdf(pdf_path, page_index=1):
+def detect_figures_in_pdf(pdf_path, page_index=1, show=True):

    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)
@ -36,4 +36,7 @@ def detect_figures_in_pdf(pdf_path, page_index=1):
    redaction_contours = detect_figures(page)
    page = draw_rectangles(page, redaction_contours)

-    show_mpl(page)
+    if show:
+        show_mpl(page)
+    else:
+        return page
--- a/vidocp/layout_detection.py
+++ b/vidocp/layout_detection.py
@ -7,10 +7,11 @@ from matplotlib import pyplot as plt

 def find_layout_boxes(image: np.array):

-    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1)
-    thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1]
-    img_bin = ~thresh
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    image = cv2.GaussianBlur(image, (5, 5), 1)
+    image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY)[1]
+    img_bin = ~image

    line_min_width = 10
    kernel_h = np.ones((10, line_min_width), np.uint8)
--- a/vidocp/layout_parsing.py
+++ b/vidocp/layout_parsing.py
@ -31,10 +31,12 @@ def find_segments(image):
 def parse_layout(image: np.array):

    image = image.copy()
-
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    blur = cv2.GaussianBlur(gray, (7, 7), 0)
-    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+    image_ = image.copy()
+    
+    if len(image_.shape) > 2:
+        image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2GRAY)
+    image_ = cv2.GaussianBlur(image_, (7, 7), 0)
+    thresh = cv2.threshold(image_, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=4)
@ -50,7 +52,8 @@ def parse_layout(image: np.array):
    _, image = cv2.threshold(image, 254, 255, cv2.THRESH_BINARY)
    image = ~image

-    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    rects = find_segments(image)
    # <- End of meta detection

@ -60,12 +63,15 @@ def parse_layout(image: np.array):
    return rects


-def annotate_layout_in_pdf(pdf_path, page_index=1):
+def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):

    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)

    rects = parse_layout(page)
    page = draw_rectangles(page, rects)
-
-    show_mpl(page)
+    
+    if show:
+        show_mpl(page)
+    else:
+        return page
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@ -18,7 +18,10 @@ def find_redactions(image: np.array, min_normalized_area=200000):

    min_normalized_area /= 200  # Assumes 200 DPI PDF -> image conversion resolution

-    gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    if len(image.shape) > 2:
+        gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    else:
+        gray = ~image
    blurred = cv2.GaussianBlur(gray, (5, 5), 1)
    thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1]

@ -30,7 +33,7 @@ def find_redactions(image: np.array, min_normalized_area=200000):
    return contours


-def annotate_redactions_in_pdf(pdf_path, page_index=1):
+def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):

    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)
@ -38,4 +41,7 @@ def annotate_redactions_in_pdf(pdf_path, page_index=1):
    redaction_contours = find_redactions(page)
    page = draw_contours(page, redaction_contours)

-    show_mpl(page)
+    if show:
+        show_mpl(page)
+    else:
+        return page
--- a/vidocp/table_parsig.py
+++ b/vidocp/table_parsig.py
@ -8,12 +8,13 @@ from matplotlib import pyplot as plt


 def parse(image: np.array):
-    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    #plt.imshow(gray_scale)
-    blurred = cv2.GaussianBlur(gray_scale, (7, 7), 2)  #5 5 1
-    thresh = cv2.threshold(blurred, 251, 255, cv2.THRESH_BINARY)[1]
+    image = cv2.GaussianBlur(image, (7, 7), 2)  #5 5 1
+    image = cv2.threshold(image, 251, 255, cv2.THRESH_BINARY)[1]
    #plt.imshow(thresh)
-    img_bin = ~thresh
+    img_bin = ~image

    line_min_width = 7
    kernel_h = np.ones((10, line_min_width), np.uint8)
@ -37,9 +38,10 @@ def parse_tables(image: np.array, rects: list):
    for rect in rects:
        (x,y,w,h) = rect
        region_of_interest = image[x:x+w, y:y+h]
-        gray = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
-        thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)[1]
-        img_bin = ~thresh
+        if len(region_of_interest.shape) > 2:
+            region_of_interest = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
+        region_of_interest = cv2.threshold(region_of_interest, 200, 255, cv2.THRESH_BINARY)[1]
+        img_bin = ~region_of_interest

        line_min_width = 5
        kernel_h = np.ones((1, line_min_width), np.uint8)
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@ -26,14 +26,15 @@ def add_external_contours(image, img):



-def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
+def isolate_vertical_and_horizontal_components(img_bin, bounding_rects, show=False):
    line_min_width = 48
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-    show_mpl(img_bin_h | img_bin_v)
+    if show:
+        show_mpl(img_bin_h | img_bin_v)

    kernel_h = np.ones((1, 30), np.uint8)
    kernel_v = np.ones((30, 1), np.uint8)
@ -46,7 +47,8 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
    img_bin_v = apply_motion_blur(img_bin_v, 80, 90)

    img_bin_final = img_bin_h | img_bin_v
-    show_mpl(img_bin_final)
+    if show:
+        show_mpl(img_bin_final)
    # changed threshold from 110 to 120 to minimize cell splitting
    th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
    img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
@ -118,15 +120,17 @@ def find_table_layout_boxes(image: np.array):
    return table_boxes


-def parse_table(image: np.array):
+def parse_table(image: np.array, show=False):
    def is_large_enough(stat):
        x1, y1, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

-    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
-    th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
+    th1, img_bin = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin
-    show_mpl(img_bin)
+    if show:
+        show_mpl(img_bin)

    table_layout_boxes = find_table_layout_boxes(image)
    img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes)
@ -143,7 +147,7 @@ def parse_table(image: np.array):
    return rects


-def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False):
+def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):
    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)
    if deskew:
@ -153,5 +157,8 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False):
    page = draw_rectangles(page, stats, annotate=True)
    # if stats:
    #     page = draw_rectangles(page, stats, annotate=True)
-
-    show_mpl(page)
+    
+    if show:
+        show_mpl(page)
+    else:
+        return page
--- a/vidocp/table_parsing_2.py
+++ b/vidocp/table_parsing_2.py
@ -48,8 +48,9 @@ def annotate_image(image, stats):

 def parse_table(image: np.array):

-    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    th1, img_bin = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin

    img_bin = isolate_vertical_and_horizontal_components(img_bin)
--- a/vidocp/utils/deskew.py
+++ b/vidocp/utils/deskew.py
@ -8,9 +8,10 @@ def detect_angle_from_lines(im: np.array, max_skew_deg=10, min_skew_deg=0.1, min
    min_skew_rad = np.deg2rad(min_skew_deg)
    width = im.shape[1]

-    im_gs = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
-    im_gs = cv2.fastNlMeansDenoising(im_gs, h=3)
-    im_bw = cv2.threshold(im_gs, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
+    if len(im.shape) > 2:
+        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+    im = cv2.fastNlMeansDenoising(im, h=3)
+    im_bw = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    lines = cv2.HoughLinesP(im_bw, 1, np.pi / 180, 200, minLineLength=width / 12, maxLineGap=width / 150)

@ -54,7 +55,8 @@ def deskew_linebased(image: np.array, verbose=False) -> np.array:

 def deskew_histbased(page: np.array, preprocess=True, max_abs_angle=1.5, delta=0.15, mode="nearest", verbose=False):
    if preprocess:
-        page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
+        if len(page.shape) > 2:
+            page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
        page = cv2.fastNlMeansDenoising(page, h=3)
    w, h = page.shape

--- a/vidocp/utils/detection.py
+++ b/vidocp/utils/detection.py
@ -8,12 +8,13 @@ def detect_large_coherent_structures(image: np.array):
    References:
         https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
    """
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

-    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
+    image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY)[1]

    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5))
-    dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)
+    dilate = cv2.dilate(~image, dilate_kernel, iterations=4)

    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@ -130,7 +130,7 @@ def xywh_to_vecs(rect):
    x1, y1, w, h = rect
    x2 = x1 + w
    y2 = y1 + h
-    return Rectangle(x1, y1, x2, y2)
+    return (x1, y1), (x2, y2)


 def vec_rect_to_xywh(rect):
--- a/vidocp/utils/preprocessing.py
+++ b/vidocp/utils/preprocessing.py
@ -0,0 +1,23 @@
+from numpy import array
+import pdf2image
+import cv2
+
+
+def open_pdf(pdf, first_page=0, last_page=None):
+    """Render a PDF path or PDF bytes to a list of page arrays; a list is returned as is."""
+    if isinstance(pdf, list):
+        return pdf
+    if not isinstance(pdf, (str, bytes)):
+        # Fail loudly instead of hitting an unbound `pages` further down.
+        raise TypeError(f"pdf must be a path, bytes or a list of pages, got {type(pdf).__name__}")
+    convert = pdf2image.convert_from_path if isinstance(pdf, str) else pdf2image.convert_from_bytes
+    # Page indices are shifted by one before being handed to pdf2image.
+    pages = convert(pdf, first_page=first_page + 1, last_page=None if last_page is None else last_page + 1)
+    return [array(p) for p in pages]
+
+
+def preprocess_pdf_image(page):
+    """Convert a multi-channel page to grayscale (if needed) and denoise it."""
+    if page.ndim > 2:
+        page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)  # NOTE(review): pdf2image pages are presumably RGB - confirm BGR2GRAY is intended
+    return cv2.fastNlMeansDenoising(page, h=3)
--- a/vidocp/utils/text.py
+++ b/vidocp/utils/text.py
@ -40,12 +40,13 @@ def find_primary_text_regions(image):

    image = image.copy()

-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

-    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+    image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
-    close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)
+    close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)

    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
    dilate = cv2.dilate(close, dilate_kernel, iterations=1)
--- a/vidocp/utils/utils.py
+++ b/vidocp/utils/utils.py
@ -1,3 +1,4 @@
+from numpy import generic
 import cv2


@ -10,3 +11,8 @@ def copy_and_normalize_channels(image):
        pass

    return image
+
+
+def npconvert(ob):  # `default=` hook for json.dumps: numpy scalars/arrays -> plain Python values
+    if hasattr(ob, "tolist"): return ob.tolist()  # np.generic and np.ndarray both provide tolist()
+    raise TypeError(f"Object of type {type(ob).__name__} is not JSON serializable")