From 635fb8481150559b322a6ab40418d186a75cd2dd Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Thu, 17 Mar 2022 21:51:15 +0100 Subject: [PATCH] post-monitoring debug, especially of deskewing and skew check --- README.md | 37 +++++++------- argparse | 0 config.yaml | 5 +- cv2 | 0 np | 0 os | 0 pdf2image | 0 plt | 0 requirements.txt | 4 +- scripts/client_mock.py | 23 +++++++-- src/run_service.py | 92 ++++++++++++++++++++++++++++------- vidocp/redaction_detection.py | 11 +++-- vidocp/utils/deskew.py | 45 +++++++++++++++-- vidocp/utils/preprocessing.py | 6 ++- yaml | 0 15 files changed, 169 insertions(+), 54 deletions(-) create mode 100644 argparse create mode 100644 cv2 create mode 100644 np create mode 100644 os create mode 100644 pdf2image create mode 100644 plt create mode 100644 yaml diff --git a/README.md b/README.md index 5ce9009..1654cc1 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ dvc pull The module provided functions for the individual tasks that all return some kind of collection of points, depending on the specific task. -#### Redaction Detection +#### Redaction Detection (API) The below snippet shows hot to find the outlines of previous redactions. @@ -44,76 +44,75 @@ page = np.array(page) redaction_contours = find_redactions(page) ``` - -### As a CLI Tool - +## As a CLI Tool Core API functionalities can be used through a CLI. - -#### Table Parsing +### Table Parsing The tables parsing utility detects and segments tables into individual cells. + ```bash python scripts/annotate.py data/test_pdf.pdf 7 --type table ``` The below image shows a parsed table, where each table cell has been detected individually. -![](data/table_parsing.png) +![Table Parsing Demonstration](data/table_parsing.png) - -#### Redaction Detection +### Redaction Detection (CLI) The redaction detection utility detects previous redactions in PDFs (filled black rectangles). + ```bash python scripts/annotate.py data/test_pdf.pdf 2 --type redaction ``` The below image shows the detected redactions with green outlines. -![](data/redaction_detection.png) +![Redaction Detection Demonstration](data/redaction_detection.png) - -#### Layout Parsing +### Layout Parsing The layout parsing utility detects elements such as paragraphs, tables and figures. + ```bash python scripts/annotate.py data/test_pdf.pdf 7 --type layout ``` The below image shows the detected layout elements on a page. -![](data/layout_parsing.png) +![Layout Parsing Demonstration](data/layout_parsing.png) - -#### Figure Detection +### Figure Detection The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility. + ```bash python scripts/annotate.py data/test_pdf.pdf 3 --type figure ``` The below image shows the detected figure on a page. -![](data/figure_detection.png) - +![Figure Detection Demonstration](data/figure_detection.png) ## Running as a service ### Building Build base image + ```bash bash setup/docker.sh ``` Build head image + ```bash docker build -f Dockerfile -t vidocp . --build-arg BASE_ROOT="" ``` -### Usage +### Usage (service) Shell 1 @@ -125,4 +124,4 @@ Shell 2 ```bash python scripts/client_mock.py --pdf_path /path/to/a/pdf -``` \ No newline at end of file +``` diff --git a/argparse b/argparse new file mode 100644 index 0000000..e69de29 diff --git a/config.yaml b/config.yaml index 888b7dc..5a25df6 100644 --- a/config.yaml +++ b/config.yaml @@ -2,6 +2,7 @@ device: cpu service: logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for log file messages logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log) + monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not webserver: host: $SERVER_HOST|"127.0.0.1" # webserver address @@ -9,9 +10,11 @@ webserver: mode: $SERVER_MODE|production # webserver mode: {development, production} deskew: + function: identity # function to use: {hist: deskew_histbased, identity: } preprocess: True max_abs_angle: 1.5 - delta: 0.15 + delta: 0.1 + test_delta: 0.15 mode: nearest verbose: False filter_strength_h: 3 \ No newline at end of file diff --git a/cv2 b/cv2 new file mode 100644 index 0000000..e69de29 diff --git a/np b/np new file mode 100644 index 0000000..e69de29 diff --git a/os b/os new file mode 100644 index 0000000..e69de29 diff --git a/pdf2image b/pdf2image new file mode 100644 index 0000000..e69de29 diff --git a/plt b/plt new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 5245971..cc44721 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,6 @@ waitress~=2.0 pytest~=6.2 envyaml~=1.8 coverage~=5.5 -dependency-check~=0.6.0 \ No newline at end of file +dependency-check~=0.6.0 +prometheus-client~=0.13.1 +prometheus_flask_exporter~=0.19.0 \ No newline at end of file diff --git a/scripts/client_mock.py b/scripts/client_mock.py index 6646c7f..d6f7ff5 100644 --- a/scripts/client_mock.py +++ b/scripts/client_mock.py @@ -1,6 +1,7 @@ # python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing import argparse import json +from multiprocessing.sharedctypes import Value import requests from vidocp.utils.preprocessing import open_pdf @@ -40,11 +41,25 @@ def main(args): # {"operations": args.operations.split(",")} # ) # } - response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb")) - response.raise_for_status() - predictions = response.json() + operations = args.operations.split(",") + for operation in operations: + print("****************************") + print(f"{' '+operation+' ':^27}") + print("****************************") + if operation == "table-parsing": + response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb")) + elif operation == "redaction-detection": + response = requests.post("http://127.0.0.1:5000/redactions", data=open(args.pdf_path, "rb")) + elif operation == "figure-detection": + response = requests.post("http://127.0.0.1:5000/figures", data=open(args.pdf_path, "rb")) + elif operation == "layout-parsing": + response = requests.post("http://127.0.0.1:5000/layout", data=open(args.pdf_path, "rb")) + else: + raise ValueError("{args.operation} is not a valid value.") + response.raise_for_status() + predictions = response.json() - print(json.dumps(predictions, indent=2)) + print(json.dumps(predictions, indent=2)) if __name__ == "__main__": diff --git a/src/run_service.py b/src/run_service.py index a74d4ab..ae8a666 100644 --- a/src/run_service.py +++ b/src/run_service.py @@ -1,8 +1,11 @@ -import argparse import json +import tracemalloc +from sys import getsizeof import logging from typing import List from flask import Flask, request, jsonify +from prometheus_client import Counter, Gauge +from prometheus_flask_exporter import PrometheusMetrics from waitress import serve from vidocp.utils import npconvert @@ -24,27 +27,56 @@ def suppress_user_warnings(): def main(): run_server() - + def run_server(): - app = Flask(__name__) + file_counter = Counter("vidocp_file_counter", "count processed files") + #page_counter = Counter("vidocp_page_counter", "count pages from processed files") + ram_metric = Gauge("vidocp_memory_usage", "Memory usage in Mb") + def start_monitoring(): + file_counter.inc() + _, peak = tracemalloc.get_traced_memory() + ram_metric.set(peak / 10 ** 6) + + logger.info(make_art()) + tracemalloc.start() + + app = Flask(__name__) + metrics = PrometheusMetrics(app=app, path='/prometheus') + @app.route("/tables", methods=["POST"]) + @metrics.summary('tables_request_time_seconds', 'Time spent processing tables request') def get_tables(): - return annotate("tables") + start_monitoring() + tables = annotate("tables") + #page_counter.inc(npages) + return tables @app.route("/redactions", methods=["POST"]) + @metrics.summary('redactions_request_time_seconds', 'Time spent processing redaction request') def get_redactions(): - return annotate("redactions") - + start_monitoring() + redactions = annotate("redactions") + #page_counter.inc(npages) + return redactions + @app.route("/figures", methods=["POST"]) + @metrics.summary('figures_request_time_seconds', 'Time spent processing figures request') def get_figures(): - return annotate("figures") - + start_monitoring() + figures = annotate("figures") + #page_counter.inc(npages) + return figures + @app.route("/layout", methods=["POST"]) + @metrics.summary('layout_request_time_seconds', 'Time spent processing layout request') def get_layout(): - return annotate("layout") - + start_monitoring() + layout = annotate("layout") + #page_counter.inc(npages) + return layout + @app.route("/status", methods=["GET"]) def status(): response = "OK" @@ -59,6 +91,7 @@ def run_server(): elif mode == "production": serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port) logging.info("Production.") + tracemalloc.stop() def apply_annotation_function(annotation_function, page_list): @@ -70,9 +103,7 @@ def apply_annotation_function(annotation_function, page_list): return outdict -def make_annotations(pdf_data, task): - pdf = open_pdf(pdf_data) - +def make_annotations(pdf, task): if task == "tables": annotation = {"tables": apply_annotation_function(parse_table, pdf)} elif task == "redactions": @@ -88,13 +119,19 @@ def make_annotations(pdf_data, task): return json.dumps(annotation, default=npconvert) +def get_size(data): + return round(getsizeof(data) / 1000000, 2) + + def annotate(task): def inner(): - data = request.data - logger.info(f"<3 Received data.") - logger.info(f"Processing data. <3") - annotations = make_annotations(data, task) - return jsonify({"result": annotations}) + data = request.data + logger.info(f"<3 Received data.") + logger.info(f"Processing data. <3") + pdf, angles = open_pdf(data) + #npages = len(pdf) + annotations = make_annotations(pdf, task) + return jsonify({"result": annotations, "deskew_angles": angles}) try: return inner() except Exception as err: @@ -103,7 +140,26 @@ def annotate(task): resp = jsonify("Analysis failed") resp.status_code = 500 return resp + + +def make_art(): + art = """ + ================================================================================================= + == ==== ============== ================= ========================================== + == ==== ============== ==== ================ ==== ========================================= + == ==== ============== ==== ================ ==== ========================================= + == ==== == == == ==== === ==== === ==== === === = ==== ==== === = === + == == ========== == ==== == == = == === = == = == = == = == = == + === == === ===== === ==== == = == ===== =========== == ======== ==== == ======= + === == === ==== ==== ==== == = == ===== ========= == ========= === ===== ======= + ==== ==== === ===== ==== == = == = == ======== = == ======= = == = == ======= + ===== ===== == == ==== ==== === ========= == ======== ==== === ======= + ================================================================================================= + +""" + return art + if __name__ == "__main__": main() diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index bbc2aad..25f8d1f 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -27,10 +27,13 @@ def find_redactions(image: np.array, min_normalized_area=200000): contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) - contours = map( - first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0])) - ) - return list(contours) + try: + contours = map( + first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0])) + ) + return list(contours) + except: + return [] def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True): diff --git a/vidocp/utils/deskew.py b/vidocp/utils/deskew.py index f3f7a71..8776a2c 100644 --- a/vidocp/utils/deskew.py +++ b/vidocp/utils/deskew.py @@ -1,5 +1,5 @@ import numpy as np -from scipy.ndimage import rotate +from scipy.ndimage import rotate as rotate_ import cv2 from vidocp.config import CONFIG @@ -14,7 +14,7 @@ def rotate_straight(im: np.array, skew_angle: int) -> np.array: def find_score(arr, angle): - data = rotate(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode) + data = rotate_(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode) hist = np.sum(data, axis=1) score = np.sum((hist[1:] - hist[:-1]) ** 2) return score @@ -36,12 +36,47 @@ def preprocess(arr: np.array): return arr +def rotate(page, angle): + rotated = rotate_(page, angle, reshape=False, order=0, mode="nearest") + return rotated + + def deskew_histbased(page: np.array): page = preprocess(page) - best_angle = find_best_angle(page) + best_angle = round(find_best_angle(page), 3) if CONFIG.deskew.verbose: print("Skew angle from pixel histogram: {}".format(best_angle)) - rotated = rotate(page, best_angle, reshape=False, order=0, mode=CONFIG.deskew.mode) - return rotated, best_angle + rotated = rotate(page, best_angle) + return (rotated, best_angle) + + +def needs_deskew(page: np.array) -> bool: + """ + Makes use of 'row-wise mean difference' - the difference between neighboring - on left and right halves + """ + + def split_rowmean_diff(page): + width = page.shape[1] + cutpoint = int(width / 2) + left = page[:, :cutpoint] + right = page[:, cutpoint:] + leftmeans = np.mean(left, axis=1) + rightmeans = np.mean(right, axis=1) + return rightmeans - leftmeans + + unrotated_score = np.mean(np.abs(split_rowmean_diff(page))) + angles = [-CONFIG.deskew.test_delta, CONFIG.deskew.test_delta] + scores = [np.mean(np.abs(split_rowmean_diff(rotate(page, angle)))) for angle in angles] + print(unrotated_score, scores) + return unrotated_score > min(scores) + + +print(CONFIG) +if CONFIG.deskew.function == "hist": + deskew = lambda page: deskew_histbased(page) if needs_deskew(page) else (page, 0) +elif CONFIG.deskew.function == "identity": + deskew = lambda page: (page, None) +else: + raise ValueError("'{CONFIG.deskew.function}' is not a valid parameter value for CONFIG.deskew.function") \ No newline at end of file diff --git a/vidocp/utils/preprocessing.py b/vidocp/utils/preprocessing.py index df26f88..07c2869 100644 --- a/vidocp/utils/preprocessing.py +++ b/vidocp/utils/preprocessing.py @@ -2,12 +2,14 @@ from numpy import array import pdf2image import cv2 +from vidocp.utils.deskew import deskew + def preprocess_pdf_image(page): if len(page.shape) > 2: page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) page = cv2.fastNlMeansDenoising(page, h=3) - return page + return deskew(page) def open_pdf(pdf, first_page=0, last_page=None): @@ -20,4 +22,4 @@ def open_pdf(pdf, first_page=0, last_page=None): elif type(pdf) == list: return pdf pages = [preprocess_pdf_image(array(p)) for p in pages] - return pages + return list(zip(*pages)) diff --git a/yaml b/yaml new file mode 100644 index 0000000..e69de29