first working version with new API

2022-03-14 21:26:49 +01:00 · 2022-03-14 21:26:49 +01:00 · a089fa5e42
commit a089fa5e42
parent 8cd8c1b1f0
19 changed files with 173 additions and 423 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ build_venv/
 /results/
 /data
 /table_parsing.egg-info
+/target/
 /tests/VV-313450.pdf
 /vidocp.egg-info/dependency_links.txt
 /vidocp.egg-info/PKG-INFO
--- a/config.yaml
+++ b/config.yaml
@@ -6,4 +6,12 @@ service:
 webserver:
  host: $SERVER_HOST|"127.0.0.1"  # webserver address
  port: $SERVER_PORT|5000  # webserver port
-  mode: $SERVER_MODE|production  # webserver mode: {development, production}
+  mode: $SERVER_MODE|production  # webserver mode: {development, production}
+
+deskew:
+  preprocess: True
+  max_abs_angle: 1.5
+  delta: 0.15
+  mode: nearest 
+  verbose: False
+  filter_strength_h: 3
--- a/scripts/client_mock.py
+++ b/scripts/client_mock.py
@@ -1,3 +1,4 @@
+# python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing
 import argparse
 import json
 import requests
@@ -8,22 +9,38 @@ from vidocp.utils.preprocessing import open_pdf
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf_path", required=True, help="path to PDF file")
-    parser.add_argument("--first_page", type=int, required=True, help="path to PDF file")
-    parser.add_argument("--last_page", type=int, required=False, default=None, help="path to PDF file")
+    parser.add_argument(
+        "--first_page", type=int, required=False, default=0, help="page number from which to start (starts at 0)"
+    )
+    parser.add_argument(
+        "--last_page",
+        type=int,
+        required=False,
+        default=None,
+        help="page number at which to stop (non-inclusive); specify None to go to the end",
+    )
+    parser.add_argument(
+        "--operations",
+        type=str,
+        required=False,
+        help="Comma-separated list of operations, any of the following: \ntable-parsing\nredaction-detection\
+            \nfigure-detection\nlayout-detection", 
+        default="table-parsing"
+    )
    args = parser.parse_args()

    return args


 def main(args):
-    
-    #data = open_pdf(args.pdf_path, args.first_page, args.last_page)
-    # params = json.dumps({
-    #     "pdf_path": "a",#args.pdf_path, 
-    #     "first_page": 4,#args.first_page, 
-    #     "last_page": 6#args.last_page
-    # })
-    response = requests.post("http://127.0.0.1:5000", data=open(args.pdf_path, "rb"))#, json=params)
+    # files = {"name": (
+    #     "name", 
+    #     open(args.pdf_path, "rb"),
+    #     "file object corresponding to pdf file",
+    #     {"operations": args.operations.split(",")}
+    #     )
+    # }
+    response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
    response.raise_for_status()
    predictions = response.json()

@@ -32,4 +49,4 @@ def main(args):

 if __name__ == "__main__":
    args = parse_args()
-    main(args)
+    main(args)
--- a/src/run_service.py
+++ b/src/run_service.py
@@ -1,15 +1,16 @@
 import argparse
 import json
 import logging
+from typing import List
 from flask import Flask, request, jsonify
 from waitress import serve

 from vidocp.utils import npconvert
-from vidocp.utils.preprocessing import preprocess_pdf_image #TODO
-from vidocp.table_parsing import parse_table#, detect_tables_in_pdf
-from vidocp.redaction_detection import find_redactions#, detect_redactions_in_pdf
-from vidocp.layout_parsing import parse_layout#, detect_layout_in_pdf #TODO
-from vidocp.figure_detection import detect_figures#, detect_figures_in_pdf #TODO
+from vidocp.utils.preprocessing import preprocess_pdf_image  # TODO
+from vidocp.table_parsing import parse_table  # , detect_tables_in_pdf
+from vidocp.redaction_detection import find_redactions  # , detect_redactions_in_pdf
+from vidocp.layout_parsing import parse_layout  # , detect_layout_in_pdf #TODO
+from vidocp.figure_detection import detect_figures  # , detect_figures_in_pdf #TODO
 from vidocp.utils.logging import logger
 from vidocp.utils.preprocessing import open_pdf
 from vidocp.config import CONFIG
@@ -17,58 +18,39 @@ from vidocp.config import CONFIG

 def suppress_user_warnings():
    import warnings
+
    warnings.filterwarnings("ignore")


-# def parse_args():
-#     parser = argparse.ArgumentParser()
-#     parser.add_argument("--warnings", action="store_true", default=False)
-#     args = parser.parse_args()
-
-#     return args
-
-
 def main():
-
-    #if not args.warnings:
-    #    suppress_user_warnings()
-
    run_server()


 def run_server():
    app = Flask(__name__)

-    @app.route("/", methods=["POST"])
-    def predict_request():
-        def inner():
-            data = request.data
-            #print(type(request))
-            #print(dir(request))
-            params = request.json
-            #print("params:", params)
-            logger.info(f"<3 Received data.")
-            print("data type:", type(data))
-            #print("json type:", type(params))
-            logger.info(f"Processing data. <3")
-            pdf_data = open_pdf(data)
-            predictions = make_predictions(pdf_data)
-            return jsonify({"result": predictions})
-        try:
-            return inner()
-        except Exception as err:
-            logger.warning("Analysis failed")
-            logger.exception(err)
-            resp = jsonify("Analysis failed")
-            resp.status_code = 500
-            return resp
+    @app.route("/tables", methods=["POST"])
+    def get_tables():
+        return annotate("tables")
+
+    @app.route("/redactions", methods=["POST"])
+    def get_redactions():
+        return annotate("redactions")
+
+    @app.route("/figures", methods=["POST"])
+    def get_figures():
+        return annotate("figures")
+    
+    @app.route("/layout", methods=["POST"])
+    def get_layout():
+        return annotate("layout")

    @app.route("/status", methods=["GET"])
    def status():
        response = "OK"
        return jsonify(response)

-    #predictor = initialize_predictor()
+    # predictor = initialize_predictor()
    logger.info("<3 Annotator ready.")

    mode = CONFIG.webserver.mode
@@ -79,21 +61,49 @@ def run_server():
        logging.info("Production.")


-def make_predictions(pdf_data):
-    output = {}
-    pdf = open_pdf(pdf_data)
-    for i, page in enumerate(pdf):
-        page = preprocess_pdf_image(page)
-        tables = json.dumps(list(parse_table(page)), default=npconvert) #list() for consistency; not strictly necessary
-        redactions = json.dumps(list(find_redactions(page)), default=npconvert)
-        layout = json.dumps(list(parse_layout(page)), default=npconvert)
-        figure = json.dumps(list(detect_figures(page)), default=npconvert)
-        output.update({i: {"tables": tables,
-                           "redactions": redactions,
-                           "layout": layout,
-                           "figure": figure}})
-    return output
+def apply_annotation_function(annotation_function, page_list):
+    outdict = {}
+    for i, page in enumerate(page_list):
+        results = annotation_function(page)
+        if results:
+            outdict.update({i: results})
+    return outdict


+def make_annotations(pdf_data, task):
+    pdf = open_pdf(pdf_data)
+    
+    if task == "tables":
+        annotation = {"tables": apply_annotation_function(parse_table, pdf)}
+    elif task == "redactions":
+        annotation = {"redactions": apply_annotation_function(find_redactions, pdf)}
+    elif task == "figures":
+        annotation =  {"figures": apply_annotation_function(detect_figures, pdf)}
+    elif task == "layout":
+        annotation = {"layout": apply_annotation_function(parse_layout, pdf)}  
+    else:
+        raise ValueError(f"'{task}' is not a valid operation keyword. Valid values include: \
+            \ntables\nredactions\nfigures\nlayout\n")
+    
+    return json.dumps(annotation, default=npconvert)
+
+
+def annotate(task):
+    def inner():
+            data = request.data
+            logger.info(f"<3 Received data.")
+            logger.info(f"Processing data. <3")
+            annotations = make_annotations(data, task)
+            return jsonify({"result": annotations})
+    try:
+        return inner()
+    except Exception as err:
+        logger.warning("Analysis failed")
+        logger.exception(err)
+        resp = jsonify("Analysis failed")
+        resp.status_code = 500
+        return resp
+    
+
 if __name__ == "__main__":
    main()
--- a/tests/test_table_parsing.py
+++ b/tests/test_table_parsing.py
@@ -21,9 +21,8 @@ def test_num_of_rects(rects):
 def test_range_of_rects(rects):
    expected_range = ((210, 605), (1430, 1620))
    topleft = min(rects)
-    x,y,w,h = max(rects)
-    bottomright = (x+w, y+h)
+    x, y, w, h = max(rects)
+    bottomright = (x + w, y + h)

    assert topleft >= expected_range[0]
    assert bottomright <= expected_range[1]
-
--- a/vidocp/config.py
+++ b/vidocp/config.py
@@ -36,4 +36,4 @@ class Config:
        return self.__getattr__(item)


-CONFIG = Config(CONFIG_FILE)
+CONFIG = Config(CONFIG_FILE)
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -25,7 +25,7 @@ def detect_figures(image: np.array):
    rects = map(cv2.boundingRect, cnts)
    rects = remove_included(rects)

-    return rects
+    return list(rects)


 def detect_figures_in_pdf(pdf_path, page_index=1, show=True):
--- a/vidocp/layout_parsing.py
+++ b/vidocp/layout_parsing.py
@@ -32,7 +32,7 @@ def parse_layout(image: np.array):

    image = image.copy()
    image_ = image.copy()
-    
+
    if len(image_.shape) > 2:
        image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2GRAY)
    image_ = cv2.GaussianBlur(image_, (7, 7), 0)
@@ -60,7 +60,7 @@ def parse_layout(image: np.array):
    rects = remove_included(rects)
    rects = remove_overlapping(rects)

-    return rects
+    return list(rects)


 def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):
@@ -70,7 +70,7 @@ def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):

    rects = parse_layout(page)
    page = draw_rectangles(page, rects)
-    
+
    if show:
        show_mpl(page)
    else:
--- a/vidocp/locations.py
+++ b/vidocp/locations.py
@@ -11,4 +11,4 @@ LOG_FILE = "/tmp/log.log"

 DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")

-TEST_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "test", "test_data")
+TEST_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "test", "test_data")
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@@ -30,7 +30,7 @@ def find_redactions(image: np.array, min_normalized_area=200000):
    contours = map(
        first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
    )
-    return contours
+    return list(contours)


 def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):
--- a/vidocp/table_parsig.py
+++ b/vidocp/table_parsig.py
@@ -1,171 +0,0 @@
-from itertools import count
-
-import cv2
-import imutils
-import numpy as np
-import pdf2image
-from matplotlib import pyplot as plt
-
-
-def parse(image: np.array):
-    if len(image.shape) > 2:
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    #plt.imshow(gray_scale)
-    image = cv2.GaussianBlur(image, (7, 7), 2)  #5 5 1
-    image = cv2.threshold(image, 251, 255, cv2.THRESH_BINARY)[1]
-    #plt.imshow(thresh)
-    img_bin = ~image
-
-    line_min_width = 7
-    kernel_h = np.ones((10, line_min_width), np.uint8)
-    kernel_v = np.ones((line_min_width, 10), np.uint8)
-
-    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
-    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-    #plt.imshow(img_bin_h)
-    #plt.imshow(img_bin_v)
-    img_bin_final = img_bin_h | img_bin_v
-    plt.imshow(img_bin_final)
-    contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = imutils.grab_contours(contours)
-    for c in contours:
-        peri = cv2.arcLength(c, True)
-        approx = cv2.approxPolyDP(c, 0.04 * peri, True)
-        yield cv2.boundingRect(approx)
-
-def parse_tables(image: np.array, rects: list):
-    parsed_tables = []
-    for rect in rects:
-        (x,y,w,h) = rect
-        region_of_interest = image[x:x+w, y:y+h]
-        if len(region_of_interest.shape) > 2:
-            region_of_interest = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
-        region_of_interest = cv2.threshold(region_of_interest, 200, 255, cv2.THRESH_BINARY)[1]
-        img_bin = ~region_of_interest
-
-        line_min_width = 5
-        kernel_h = np.ones((1, line_min_width), np.uint8)
-        kernel_v = np.ones((line_min_width, 1), np.uint8)
-
-        img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
-        img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-    # find_and_close_internal_gaps(img_bin_v)
-        img_bin_final = img_bin_h | img_bin_v
-    #plt.imshow(img_bin_final)
-    # find_and_close_internal_gaps(img_bin_final)
-    # find_and_close_edges(img_bin_final)
-
-        _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
-        parsed_tables.append([(x,y,w,h), stats])
-    return parsed_tables
-        #yield (x,y,w,h), stats, region_of_interest
-    # return stats
-
-def annotate_table(image, parsed_tables):
-    for table in parsed_tables:
-        original_coordinates, stats = table
-        stats = filter_unconnected_cells(stats)
-        for stat in stats:
-            x, y, w, h, area = stat
-            cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
-            for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
-                anno = f"{s} = {v}"
-                xann = int(x + 5)
-                yann = int(y + h - (20 * (i + 1)))
-                cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
-
-    return image
-
-
-def filter_unconnected_cells(stats):
-    filtered_cells = []
-    # print(stats)
-    for left, middle, right in zip(stats[0:], stats[1:],
-                                   list(stats[2:]) + [np.array([None, None, None, None, None])]):
-        x, y, w, h, area = middle
-        if w > 35 and h > 13 and area > 500:
-            if right[1] is None:
-                if y == left[1] or x == left[0]:
-                    filtered_cells.append(middle)
-            else:
-                if y == left[1] or y == right[1] or x == left[0] or x == right[0]:
-                    filtered_cells.append(middle)
-    return filtered_cells
-
-def find_and_close_edges(img_bin_final):
-    contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
-    for cnt in contours:
-        missing_external_edges = True
-        left = tuple(cnt[cnt[:, :, 0].argmin()][0])
-        right = tuple(cnt[cnt[:, :, 0].argmax()][0])
-        top = tuple(cnt[cnt[:, :, 1].argmin()][0])
-        bottom = tuple(cnt[cnt[:, :, 1].argmax()][0])
-        topleft = [left[0], top[1]]
-        bottomright = [right[0], bottom[1]]
-        for arr in cnt:
-            if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])):
-                missing_external_edges = False
-                break
-
-        if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000:
-            cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2)
-            # print("missing cell detectet rectangle drawn")
-
-    return img_bin_final
-
-
-
-def parse_tables_in_pdf(pages):
-    return zip(map(parse, pages), count())
-
-# def annotate_tables_in_pdf(pdf_path, page_index=1):
-#     # timeit()
-#     page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
-#     page = np.array(page)
-#
-#     _, stats = parse(page)
-#     page = annotate_image(page, stats)
-#     # print(timeit())
-#     fig, ax = plt.subplots(1, 1)
-#     fig.set_size_inches(20, 20)
-#     ax.imshow(page)
-#     plt.show()
-
-
-def annotate_boxes(image, rects):
-    print(type(rects))
-    for rect in rects:
-        (x, y, w, h) = rect
-        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
-
-    return image
-
-def filter_tables_or_images(rects):
-    filtered = []
-    for rect in rects:
-        (x,y,w,h) = rect
-        print(w*h)
-        if w * h > 10**6:
-            filtered.append(rect)
-    print(filtered)
-    return filtered
-
-
-
-
-def annotate_tables_in_pdf(pdf_path, page_index=1):
-    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
-    page = np.array(page)
-
-    layout_boxes = parse(page)
-    page = annotate_boxes(page, layout_boxes)
-    parsed_tables = parse_tables(page, filter_tables_or_images(layout_boxes))
-    page = annotate_table(page, parsed_tables)
-
-
-
-    fig, ax = plt.subplots(1, 1)
-    fig.set_size_inches(20, 20)
-    ax.imshow(page)
-    plt.show()
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -1,5 +1,5 @@
 from functools import partial
-from itertools import chain, compress, starmap
+from itertools import chain, starmap
 from operator import attrgetter

 import cv2
@@ -25,7 +25,6 @@ def add_external_contours(image, img):
    return image


-
 def isolate_vertical_and_horizontal_components(img_bin, bounding_rects, show=False):
    line_min_width = 48
    kernel_h = np.ones((1, line_min_width), np.uint8)
@@ -77,19 +76,9 @@ def has_table_shape(rects):

    rects = list(map(xywh_to_vec_rect, rects))

-    # print(rects)
-    # print(brect)
-
    def matches_bounding_rect_corner(rect, x, y):
        corresp_coords = list(zip(*map(attrgetter(x, y), [brect, rect])))
        ret = all(starmap(partial(adjacent1d, tolerance=30), corresp_coords))
-        # print()
-        # print(x, y)
-        # print(brect)
-        # print(rect)
-        # print(corresp_coords)
-        # print(ret)
-
        return ret

    return all(
@@ -120,31 +109,36 @@ def find_table_layout_boxes(image: np.array):
    return table_boxes


+def preprocess(image: np.array):
+    if len(image.shape) > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
+    th1, image = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
+    image = ~image
+    return image
+
+
 def parse_table(image: np.array, show=False):
    def is_large_enough(stat):
        x1, y1, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

-    if len(image.shape) > 2:
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
-    th1, img_bin = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
-    img_bin = ~img_bin
+    image = preprocess(image)
    if show:
-        show_mpl(img_bin)
+        show_mpl(image)

    table_layout_boxes = find_table_layout_boxes(image)
-    img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes)
-    img_bin_final = add_external_contours(img_bin, img_bin)
+    image = isolate_vertical_and_horizontal_components(image, table_layout_boxes)
+    image = add_external_contours(image, image)

-    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
+    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)

    stats = np.vstack(list(filter(is_large_enough, stats)))
    rects = stats[:, :-1][2:]

    # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
-    rects = list(remove_isolated(rects, input_sorted=True))
+    rects = remove_isolated(rects, input_sorted=True)

-    return rects
+    return list(rects)


 def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):
@@ -155,9 +149,7 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):

    stats = parse_table(page)
    page = draw_rectangles(page, stats, annotate=True)
-    # if stats:
-    #     page = draw_rectangles(page, stats, annotate=True)
-    
+
    if show:
        show_mpl(page)
    else:
--- a/vidocp/table_parsing_2.py
+++ b/vidocp/table_parsing_2.py
@@ -1,75 +0,0 @@
-import cv2
-import matplotlib.pyplot as plt
-import numpy as np
-from pdf2image import pdf2image
-
-
-def add_external_contours(image, img):
-
-    contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
-
-    for cnt in contours:
-        x, y, w, h = cv2.boundingRect(cnt)
-        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
-
-    return image
-
-
-def isolate_vertical_and_horizontal_components(img_bin):
-
-    line_min_width = 30
-    kernel_h = np.ones((1, line_min_width), np.uint8)
-    kernel_v = np.ones((line_min_width, 1), np.uint8)
-
-    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
-    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-
-    img_bin_final = img_bin_h | img_bin_v
-
-    return img_bin_final
-
-
-def annotate_image(image, stats):
-
-    image = image.copy()
-
-    for x, y, w, h, area in stats[2:]:
-        if w > 10 and h > 10:
-            cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
-
-            for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
-                anno = f"{s} = {v}"
-                xann = int(x + 5)
-                yann = int(y + h - (20 * (i + 1)))
-                cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
-
-    return image
-
-
-def parse_table(image: np.array):
-
-    if len(image.shape) > 2:
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    th1, img_bin = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)
-    img_bin = ~img_bin
-
-    img_bin = isolate_vertical_and_horizontal_components(img_bin)
-    img_bin_final = add_external_contours(img_bin, img_bin)
-
-    _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
-
-    return stats
-
-
-def annotate_tables_in_pdf(pdf_path, page_index=1):
-
-    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
-    page = np.array(page)
-
-    stats = parse_table(page)
-    page = annotate_image(page, stats)
-
-    fig, ax = plt.subplots(1, 1)
-    fig.set_size_inches(20, 20)
-    ax.imshow(page)
-    plt.show()
--- a/vidocp/utils/deskew.py
+++ b/vidocp/utils/deskew.py
@@ -2,78 +2,46 @@ import numpy as np
 from scipy.ndimage import rotate
 import cv2

-
-def detect_angle_from_lines(im: np.array, max_skew_deg=10, min_skew_deg=0.1, min_nlines=5) -> int:
-    max_skew_rad = np.deg2rad(max_skew_deg)
-    min_skew_rad = np.deg2rad(min_skew_deg)
-    width = im.shape[1]
-
-    if len(im.shape) > 2:
-        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
-    im = cv2.fastNlMeansDenoising(im, h=3)
-    im_bw = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
-
-    lines = cv2.HoughLinesP(im_bw, 1, np.pi / 180, 200, minLineLength=width / 12, maxLineGap=width / 150)
-
-    angles = []
-    for line in lines:
-        x1, y1, x2, y2 = line[0]
-        raw_angle = np.arctan2(y2 - y1, x2 - x1)
-        angles.append(min(raw_angle, np.pi / 2 - raw_angle))
-    angles = [angle for angle in angles if (abs(angle) < max_skew_rad)]
-    nonzero = list(filter(lambda x: x != 0, angles))
-
-    # empirically found this ad hoc approach to work
-    robust_avg = (np.mean(angles) + np.mean(nonzero) + np.median(nonzero)) / 3
-    # slightly lower alternative:
-    # robust_avg = (np.mean(angles) + np.mean(nonzero) + np.median(angles) + np.median(nonzero)) / 4
-
-    if robust_avg < min_skew_rad or min(len(angles), len(nonzero)) < min_nlines:
-        return 0.0
-    return np.rad2deg(robust_avg)
+from vidocp.config import CONFIG


 def rotate_straight(im: np.array, skew_angle: int) -> np.array:
    h, w = im.shape[:2]
    center = (w // 2, h // 2)
-
    M = cv2.getRotationMatrix2D(center, skew_angle, 1.0)
-
    rotated = cv2.warpAffine(im, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated


-def deskew_linebased(image: np.array, verbose=False) -> np.array:
-    skew_angle = detect_angle_from_lines(image)
-    if verbose:
-        print(f"Skew angle from lines: {skew_angle}")
-    if skew_angle:
-        deskewed = rotate_straight(image, skew_angle)
-        return deskewed
-    return image
+def find_score(arr, angle):
+    data = rotate(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
+    hist = np.sum(data, axis=1)
+    score = np.sum((hist[1:] - hist[:-1]) ** 2)
+    return score


-def deskew_histbased(page: np.array, preprocess=True, max_abs_angle=1.5, delta=0.15, mode="nearest", verbose=False):
-    if preprocess:
-        if len(page.shape) > 2:
-            page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
-        page = cv2.fastNlMeansDenoising(page, h=3)
-    w, h = page.shape
-
-    def find_score(arr, angle):
-        data = rotate(arr, angle, reshape=False, order=0)
-        hist = np.sum(data, axis=1)
-        score = np.sum((hist[1:] - hist[:-1]) ** 2)
-        return score
-
-    angles = np.arange(-max_abs_angle, max_abs_angle + delta, delta)
-    scores = []
-    for angle in angles:
-        scores.append(find_score(page, angle))
-
+def find_best_angle(page):
+    lim = CONFIG.deskew.max_abs_angle
+    delta = CONFIG.deskew.delta
+    angles = np.arange(-lim, lim + delta, delta)
+    scores = [find_score(page, angle) for angle in angles]
    best_angle = angles[scores.index(max(scores))]
-    if verbose:
+    return best_angle
+
+
+def preprocess(arr: np.array):
+    if len(arr.shape) > 2:
+        arr = cv2.cvtColor(arr, cv2.COLOR_BGR2GRAY)
+    arr = cv2.fastNlMeansDenoising(arr, h=CONFIG.deskew.filter_strength_h)
+    return arr
+
+
+def deskew_histbased(page: np.array):
+    page = preprocess(page)
+    best_angle = find_best_angle(page)
+
+    if CONFIG.deskew.verbose:
        print("Skew angle from pixel histogram: {}".format(best_angle))

-    rotated = rotate(page, best_angle, reshape=False, order=0, mode=mode)
+    rotated = rotate(page, best_angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
    return rotated, best_angle
--- a/vidocp/utils/detection.py
+++ b/vidocp/utils/detection.py
@@ -19,6 +19,6 @@ def detect_large_coherent_structures(image: np.array):
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)

-    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    counts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

-    return cnts
+    return counts
--- a/vidocp/utils/logging.py
+++ b/vidocp/utils/logging.py
@@ -1,11 +1,10 @@
 """Defines the default logger for the service."""
-
-
 import sys
 import logging

 from vidocp.config import CONFIG

+
 def get_logger():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.getLevelName(CONFIG.service.logging_level))
@@ -19,4 +18,5 @@ def get_logger():
    logger.propagate = False
    return logger

-logger = get_logger()
+
+logger = get_logger()
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -26,7 +26,7 @@ def remove_included(rectangles):

    def is_not_included(rect, rectangles):
        return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
-        
+
    rectangles = list(map(xywh_to_vec_rect, rectangles))
    rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
    rectangles = map(vec_rect_to_xywh, rectangles)
--- a/vidocp/utils/preprocessing.py
+++ b/vidocp/utils/preprocessing.py
@@ -3,6 +3,13 @@ import pdf2image
 import cv2


+def preprocess_pdf_image(page):
+    if len(page.shape) > 2:
+        page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
+    page = cv2.fastNlMeansDenoising(page, h=3)
+    return page
+
+
 def open_pdf(pdf, first_page=0, last_page=None):
    first_page += 1
    last_page = None if last_page is None else last_page + 1
@@ -12,12 +19,5 @@ def open_pdf(pdf, first_page=0, last_page=None):
        pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
    elif type(pdf) == list:
        return pdf
-    pages = [array(p) for p in pages]
+    pages = [preprocess_pdf_image(array(p)) for p in pages]
    return pages
-
-
-def preprocess_pdf_image(page):
-    if len(page.shape) > 2:
-        page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
-    page = cv2.fastNlMeansDenoising(page, h=3)
-    return page
--- a/vidocp/utils/utils.py
+++ b/vidocp/utils/utils.py
@@ -14,5 +14,6 @@ def copy_and_normalize_channels(image):


 def npconvert(ob):
-    if isinstance(ob, generic): return ob.item()  
-    raise TypeError
+    if isinstance(ob, generic):
+        return ob.item()
+    raise TypeError