From 635fb8481150559b322a6ab40418d186a75cd2dd Mon Sep 17 00:00:00 2001
From: Isaac Riley <Isaac.Riley@iqser.com>
Date: Thu, 17 Mar 2022 21:51:15 +0100
Subject: [PATCH] post-monitoring debug, especially of deskewing and skew check

---
 README.md                     | 37 +++++++-------
 argparse                      |  0
 config.yaml                   |  5 +-
 cv2                           |  0
 np                            |  0
 os                            |  0
 pdf2image                     |  0
 plt                           |  0
 requirements.txt              |  4 +-
 scripts/client_mock.py        | 23 +++++++--
 src/run_service.py            | 92 ++++++++++++++++++++++++++++-------
 vidocp/redaction_detection.py | 11 +++--
 vidocp/utils/deskew.py        | 45 +++++++++++++++--
 vidocp/utils/preprocessing.py |  6 ++-
 yaml                          |  0
 15 files changed, 169 insertions(+), 54 deletions(-)
 create mode 100644 argparse
 create mode 100644 cv2
 create mode 100644 np
 create mode 100644 os
 create mode 100644 pdf2image
 create mode 100644 plt
 create mode 100644 yaml

diff --git a/README.md b/README.md
index 5ce9009..1654cc1 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ dvc pull
 The module provided functions for the individual tasks that all return some kind of collection of points, depending on
 the specific task.
 
-#### Redaction Detection
+#### Redaction Detection (API)
 
 The below snippet shows hot to find the outlines of previous redactions.
 
@@ -44,76 +44,75 @@ page = np.array(page)
 redaction_contours = find_redactions(page)
 ```
 
-
-### As a CLI Tool
-
+## As a CLI Tool
 
 Core API functionalities can be used through a CLI.
 
-
-#### Table Parsing
+### Table Parsing
 
 The tables parsing utility detects and segments tables into individual cells.
+
 ```bash
 python scripts/annotate.py data/test_pdf.pdf 7 --type table
 ```
 
 The below image shows a parsed table, where each table cell has been detected individually.
 
-![](data/table_parsing.png)
+![Table Parsing Demonstration](data/table_parsing.png)
 
-
-#### Redaction Detection
+### Redaction Detection (CLI)
 
 The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
+
 ```bash
 python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
 ```
 
 The below image shows the detected redactions with green outlines.
 
-![](data/redaction_detection.png)
+![Redaction Detection Demonstration](data/redaction_detection.png)
 
-
-#### Layout Parsing
+### Layout Parsing
 
 The layout parsing utility detects elements such as paragraphs, tables and figures.
+
 ```bash
 python scripts/annotate.py data/test_pdf.pdf 7 --type layout
 ```
 
 The below image shows the detected layout elements on a page.
 
-![](data/layout_parsing.png)
+![Layout Parsing Demonstration](data/layout_parsing.png)
 
-
-#### Figure Detection
+### Figure Detection
 
 The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
+
 ```bash
 python scripts/annotate.py data/test_pdf.pdf 3 --type figure
 ```
 
 The below image shows the detected figure on a page.
 
-![](data/figure_detection.png)
-
+![Figure Detection Demonstration](data/figure_detection.png)
 
 ## Running as a service
 
 ### Building
 
 Build base image
+
 ```bash
 bash setup/docker.sh
 ```
 
 Build head image
+
 ```bash
 docker build -f Dockerfile -t vidocp . --build-arg BASE_ROOT=""
 ```
 
-### Usage
+### Usage (service)
 
 Shell 1
 
@@ -125,4 +124,4 @@ Shell 2
 
 ```bash
 python scripts/client_mock.py --pdf_path /path/to/a/pdf
-```
\ No newline at end of file
+```
diff --git a/argparse b/argparse
new file mode 100644
index 0000000..e69de29
diff --git a/config.yaml b/config.yaml
index 888b7dc..5a25df6 100644
--- a/config.yaml
+++ b/config.yaml
@@ -2,6 +2,7 @@ device: cpu
 service:
   logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for log file messages
   logfile_path: $LOGFILE_PATH|null  # Overwrites the default path for the service logfile (image_service/log.log)
+  monitoring_enabled: $MONITORING_ENABLED|True # Whether monitoring is enabled for the app
 
 webserver:
   host: $SERVER_HOST|"127.0.0.1"  # webserver address
@@ -9,9 +10,11 @@ webserver:
   mode: $SERVER_MODE|production  # webserver mode: {development, production}
 
 deskew:
+  function: identity # function to use: {hist: deskew_histbased, identity: <identity mapping>}
   preprocess: True
   max_abs_angle: 1.5
-  delta: 0.15
+  delta: 0.1
+  test_delta: 0.15
   mode: nearest 
   verbose: False
   filter_strength_h: 3
\ No newline at end of file
diff --git a/cv2 b/cv2
new file mode 100644
index 0000000..e69de29
diff --git a/np b/np
new file mode 100644
index 0000000..e69de29
diff --git a/os b/os
new file mode 100644
index 0000000..e69de29
diff --git a/pdf2image b/pdf2image
new file mode 100644
index 0000000..e69de29
diff --git a/plt b/plt
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 5245971..cc44721 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,6 @@ waitress~=2.0
 pytest~=6.2
 envyaml~=1.8
 coverage~=5.5
-dependency-check~=0.6.0
\ No newline at end of file
+dependency-check~=0.6.0
+prometheus-client~=0.13.1
+prometheus_flask_exporter~=0.19.0
\ No newline at end of file
diff --git a/scripts/client_mock.py b/scripts/client_mock.py
index 6646c7f..d6f7ff5 100644
--- a/scripts/client_mock.py
+++ b/scripts/client_mock.py
@@ -1,6 +1,7 @@
 # python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing
 import argparse
 import json
+from multiprocessing.sharedctypes import Value
 import requests
 
 from vidocp.utils.preprocessing import open_pdf
@@ -40,11 +41,25 @@ def main(args):
     #     {"operations": args.operations.split(",")}
     #     )
     # }
-    response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
-    response.raise_for_status()
-    predictions = response.json()
+    # Map each supported operation name to the service route that handles it.
+    endpoints = {
+        "table-parsing": "tables",
+        "redaction-detection": "redactions",
+        "figure-detection": "figures",
+        "layout-parsing": "layout",
+    }
+    for operation in args.operations.split(","):
+        print("****************************")
+        print(f"{' '+operation+' ':^27}")
+        print("****************************")
+        if operation not in endpoints:
+            raise ValueError(f"{operation} is not a valid value.")
+        with open(args.pdf_path, "rb") as pdf_file:  # close the handle once the upload is done
+            response = requests.post(f"http://127.0.0.1:5000/{endpoints[operation]}", data=pdf_file)
+        response.raise_for_status()
+        predictions = response.json()
 
-    print(json.dumps(predictions, indent=2))
+        print(json.dumps(predictions, indent=2))
 
 
 if __name__ == "__main__":
diff --git a/src/run_service.py b/src/run_service.py
index a74d4ab..ae8a666 100644
--- a/src/run_service.py
+++ b/src/run_service.py
@@ -1,8 +1,11 @@
-import argparse
 import json
+import tracemalloc
+from sys import getsizeof
 import logging
 from typing import List
 from flask import Flask, request, jsonify
+from prometheus_client import Counter, Gauge
+from prometheus_flask_exporter import PrometheusMetrics
 from waitress import serve
 
 from vidocp.utils import npconvert
@@ -24,27 +27,56 @@ def suppress_user_warnings():
 
 def main():
     run_server()
-
+    
 
 def run_server():
-    app = Flask(__name__)
+    file_counter = Counter("vidocp_file_counter", "count processed files")
+    #page_counter = Counter("vidocp_page_counter", "count pages from processed files")
+    ram_metric = Gauge("vidocp_memory_usage", "Memory usage in Mb")
 
+    def start_monitoring():
+        file_counter.inc()
+        _, peak = tracemalloc.get_traced_memory()
+        ram_metric.set(peak / 10 ** 6)
+
+    logger.info(make_art())
+    tracemalloc.start()
+    
+    app = Flask(__name__)
+    metrics = PrometheusMetrics(app=app, path='/prometheus')
+    
     @app.route("/tables", methods=["POST"])
+    @metrics.summary('tables_request_time_seconds', 'Time spent processing tables request')
     def get_tables():
-        return annotate("tables")
+        start_monitoring()
+        tables = annotate("tables")
+        #page_counter.inc(npages)
+        return tables
 
     @app.route("/redactions", methods=["POST"])
+    @metrics.summary('redactions_request_time_seconds', 'Time spent processing redaction request')
     def get_redactions():
-        return annotate("redactions")
-
+        start_monitoring()
+        redactions = annotate("redactions")
+        #page_counter.inc(npages)
+        return redactions
+        
     @app.route("/figures", methods=["POST"])
+    @metrics.summary('figures_request_time_seconds', 'Time spent processing figures request')
     def get_figures():
-        return annotate("figures")
-    
+        start_monitoring()
+        figures = annotate("figures")
+        #page_counter.inc(npages)
+        return figures
+        
     @app.route("/layout", methods=["POST"])
+    @metrics.summary('layout_request_time_seconds', 'Time spent processing layout request')
     def get_layout():
-        return annotate("layout")
-
+        start_monitoring()
+        layout = annotate("layout")
+        #page_counter.inc(npages)
+        return layout
+        
     @app.route("/status", methods=["GET"])
     def status():
         response = "OK"
@@ -59,6 +91,7 @@ def run_server():
     elif mode == "production":
         serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
         logging.info("Production.")
+    tracemalloc.stop()
 
 
 def apply_annotation_function(annotation_function, page_list):
@@ -70,9 +103,7 @@ def apply_annotation_function(annotation_function, page_list):
     return outdict
 
 
-def make_annotations(pdf_data, task):
-    pdf = open_pdf(pdf_data)
-    
+def make_annotations(pdf, task):
     if task == "tables":
         annotation = {"tables": apply_annotation_function(parse_table, pdf)}
     elif task == "redactions":
@@ -88,13 +119,19 @@ def make_annotations(pdf_data, task):
     return json.dumps(annotation, default=npconvert)
 
 
+def get_size(data):
+    return round(getsizeof(data) / 1000000, 2)  # sys.getsizeof result scaled by 1e6, rounded to 2 decimals
+
+
 def annotate(task):
     def inner():
-            data = request.data
-            logger.info(f"<3 Received data.")
-            logger.info(f"Processing data. <3")
-            annotations = make_annotations(data, task)
-            return jsonify({"result": annotations})
+        data = request.data
+        logger.info(f"<3 Received data.")
+        logger.info(f"Processing data. <3")
+        pdf, angles = open_pdf(data)
+        #npages = len(pdf)
+        annotations = make_annotations(pdf, task)
+        return jsonify({"result": annotations, "deskew_angles": angles})
     try:
         return inner()
     except Exception as err:
@@ -103,7 +140,26 @@ def annotate(task):
         resp = jsonify("Analysis failed")
         resp.status_code = 500
         return resp
+
+
+def make_art():
+    art = """
     
+    =================================================================================================
+    ==  ====  ==============       =================       ==========================================
+    ==  ====  ==============  ====  ================  ====  =========================================
+    ==  ====  ==============  ====  ================  ====  =========================================
+    ==  ====  ==  ==      ==  ====  ===   ====   ===  ====  ===   ===  =   ====   ====   ===  =   ===
+    ==   ==   ==========  ==  ====  ==     ==  =  ==       ===  =  ==    =  ==  =  ==  =  ==    =  ==
+    ===  ==  ===  =====  ===  ====  ==  =  ==  =====  ===========  ==  ========  ====     ==  =======
+    ===  ==  ===  ====  ====  ====  ==  =  ==  =====  =========    ==  =========  ===  =====  =======
+    ====    ====  ===  =====  ====  ==  =  ==  =  ==  ========  =  ==  =======  =  ==  =  ==  =======
+    =====  =====  ==      ==       ====   ====   ===  =========    ==  ========   ====   ===  =======
+    =================================================================================================
+
+"""
+    return art
+
 
 if __name__ == "__main__":
     main()
diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py
index bbc2aad..25f8d1f 100644
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@@ -27,10 +27,13 @@ def find_redactions(image: np.array, min_normalized_area=200000):
 
     contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
 
-    contours = map(
-        first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
-    )
-    return list(contours)
+    try:
+        contours = map(
+            first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
+        )
+        return list(contours)
+    except (TypeError, IndexError):  # hierarchies is None/empty when no contours were found
+        return []
 
 
 def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):
diff --git a/vidocp/utils/deskew.py b/vidocp/utils/deskew.py
index f3f7a71..8776a2c 100644
--- a/vidocp/utils/deskew.py
+++ b/vidocp/utils/deskew.py
@@ -1,5 +1,5 @@
 import numpy as np
-from scipy.ndimage import rotate
+from scipy.ndimage import rotate as rotate_
 import cv2
 
 from vidocp.config import CONFIG
@@ -14,7 +14,7 @@ def rotate_straight(im: np.array, skew_angle: int) -> np.array:
 
 
 def find_score(arr, angle):
-    data = rotate(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
+    data = rotate_(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
     hist = np.sum(data, axis=1)
     score = np.sum((hist[1:] - hist[:-1]) ** 2)
     return score
@@ -36,12 +36,47 @@ def preprocess(arr: np.array):
     return arr
 
 
+def rotate(page, angle):
+    # Use the configured border mode, consistent with find_score, instead of a hard-coded one.
+    return rotate_(page, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
+
+
 def deskew_histbased(page: np.array):
     page = preprocess(page)
-    best_angle = find_best_angle(page)
+    best_angle = round(find_best_angle(page), 3)
 
     if CONFIG.deskew.verbose:
         print("Skew angle from pixel histogram: {}".format(best_angle))
 
-    rotated = rotate(page, best_angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
-    return rotated, best_angle
+    rotated = rotate(page, best_angle)
+    return (rotated, best_angle)
+
+
+def needs_deskew(page: np.array) -> bool:
+    """Return True if rotating `page` by +/- CONFIG.deskew.test_delta lowers its score.
+
+    The score is the mean absolute difference between the row-wise mean
+    values of the left and right halves of the page.
+    """
+
+    def skew_score(image):
+        cutpoint = image.shape[1] // 2
+        leftmeans = np.mean(image[:, :cutpoint], axis=1)
+        rightmeans = np.mean(image[:, cutpoint:], axis=1)
+        return np.mean(np.abs(rightmeans - leftmeans))
+
+    unrotated_score = skew_score(page)
+    # Probe one small rotation in each direction; deskewing is requested only
+    # when at least one of them scores lower than the untouched page.
+    angles = [-CONFIG.deskew.test_delta, CONFIG.deskew.test_delta]
+    scores = [skew_score(rotate(page, angle)) for angle in angles]
+    return bool(unrotated_score > min(scores))
+
+
+# Pick the deskew callable once at import time; every variant returns (page, angle).
+if CONFIG.deskew.function == "hist":
+    deskew = lambda page: deskew_histbased(page) if needs_deskew(page) else (page, 0)
+elif CONFIG.deskew.function == "identity":
+    deskew = lambda page: (page, None)
+else:
+    raise ValueError(f"'{CONFIG.deskew.function}' is not a valid parameter value for CONFIG.deskew.function")
\ No newline at end of file
diff --git a/vidocp/utils/preprocessing.py b/vidocp/utils/preprocessing.py
index df26f88..07c2869 100644
--- a/vidocp/utils/preprocessing.py
+++ b/vidocp/utils/preprocessing.py
@@ -2,12 +2,14 @@ from numpy import array
 import pdf2image
 import cv2
 
+from vidocp.utils.deskew import deskew
+
 
 def preprocess_pdf_image(page):
     if len(page.shape) > 2:
         page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
     page = cv2.fastNlMeansDenoising(page, h=3)
-    return page
+    return deskew(page)
 
 
 def open_pdf(pdf, first_page=0, last_page=None):
@@ -20,4 +22,4 @@ def open_pdf(pdf, first_page=0, last_page=None):
     elif type(pdf) == list:
         return pdf
     pages = [preprocess_pdf_image(array(p)) for p in pages]
-    return pages
+    return list(zip(*pages))
diff --git a/yaml b/yaml
new file mode 100644
index 0000000..e69de29