post-monitoring debug, especially of deskewing and skew check

This commit is contained in:
Isaac Riley 2022-03-17 21:51:15 +01:00
parent fa479adfb0
commit 635fb84811
15 changed files with 169 additions and 54 deletions

View File

@ -25,7 +25,7 @@ dvc pull
The module provides functions for the individual tasks; each returns some kind of collection of points, depending on
the specific task.
#### Redaction Detection
#### Redaction Detection (API)
The below snippet shows how to find the outlines of previous redactions.
@ -44,76 +44,75 @@ page = np.array(page)
redaction_contours = find_redactions(page)
```
### As a CLI Tool
## As a CLI Tool
Core API functionalities can be used through a CLI.
#### Table Parsing
### Table Parsing
The table parsing utility detects and segments tables into individual cells.
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```
The below image shows a parsed table, where each table cell has been detected individually.
![](data/table_parsing.png)
![Table Parsing Demonstration](data/table_parsing.png)
#### Redaction Detection
### Redaction Detection (CLI)
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
```
The below image shows the detected redactions with green outlines.
![](data/redaction_detection.png)
![Redaction Detection Demonstration](data/redaction_detection.png)
#### Layout Parsing
### Layout Parsing
The layout parsing utility detects elements such as paragraphs, tables and figures.
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
```
The below image shows the detected layout elements on a page.
![](data/layout_parsing.png)
![Layout Parsing Demonstration](data/layout_parsing.png)
#### Figure Detection
### Figure Detection
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
```bash
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
```
The below image shows the detected figure on a page.
![](data/figure_detection.png)
![Figure Detection Demonstration](data/figure_detection.png)
## Running as a service
### Building
Build base image
```bash
bash setup/docker.sh
```
Build head image
```bash
docker build -f Dockerfile -t vidocp . --build-arg BASE_ROOT=""
```
### Usage
### Usage (service)
Shell 1

0
argparse Normal file
View File

View File

@ -2,6 +2,7 @@ device: cpu
service:
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for log file messages
logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log)
monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not
webserver:
host: $SERVER_HOST|"127.0.0.1" # webserver address
@ -9,9 +10,11 @@ webserver:
mode: $SERVER_MODE|production # webserver mode: {development, production}
deskew:
function: identity # function to use: {hist: deskew_histbased, identity: <identity mapping>}
preprocess: True
max_abs_angle: 1.5
delta: 0.15
delta: 0.1
test_delta: 0.15
mode: nearest
verbose: False
filter_strength_h: 3

0
cv2 Normal file
View File

0
np Normal file
View File

0
os Normal file
View File

0
pdf2image Normal file
View File

0
plt Normal file
View File

View File

@ -14,3 +14,5 @@ pytest~=6.2
envyaml~=1.8
coverage~=5.5
dependency-check~=0.6.0
prometheus-client~=0.13.1
prometheus_flask_exporter~=0.19.0

View File

@ -1,6 +1,7 @@
# python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing
import argparse
import json
from multiprocessing.sharedctypes import Value
import requests
from vidocp.utils.preprocessing import open_pdf
@ -40,11 +41,25 @@ def main(args):
# {"operations": args.operations.split(",")}
# )
# }
response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
response.raise_for_status()
predictions = response.json()
operations = args.operations.split(",")
for operation in operations:
print("****************************")
print(f"{' '+operation+' ':^27}")
print("****************************")
if operation == "table-parsing":
response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
elif operation == "redaction-detection":
response = requests.post("http://127.0.0.1:5000/redactions", data=open(args.pdf_path, "rb"))
elif operation == "figure-detection":
response = requests.post("http://127.0.0.1:5000/figures", data=open(args.pdf_path, "rb"))
elif operation == "layout-parsing":
response = requests.post("http://127.0.0.1:5000/layout", data=open(args.pdf_path, "rb"))
else:
raise ValueError("{args.operation} is not a valid value.")
response.raise_for_status()
predictions = response.json()
print(json.dumps(predictions, indent=2))
print(json.dumps(predictions, indent=2))
if __name__ == "__main__":

View File

@ -1,8 +1,11 @@
import argparse
import json
import tracemalloc
from sys import getsizeof
import logging
from typing import List
from flask import Flask, request, jsonify
from prometheus_client import Counter, Gauge
from prometheus_flask_exporter import PrometheusMetrics
from waitress import serve
from vidocp.utils import npconvert
@ -27,23 +30,52 @@ def main():
def run_server():
file_counter = Counter("vidocp_file_counter", "count processed files")
#page_counter = Counter("vidocp_page_counter", "count pages from processed files")
ram_metric = Gauge("vidocp_memory_usage", "Memory usage in Mb")
def start_monitoring():
file_counter.inc()
_, peak = tracemalloc.get_traced_memory()
ram_metric.set(peak / 10 ** 6)
logger.info(make_art())
tracemalloc.start()
app = Flask(__name__)
metrics = PrometheusMetrics(app=app, path='/prometheus')
@app.route("/tables", methods=["POST"])
@metrics.summary('tables_request_time_seconds', 'Time spent processing tables request')
def get_tables():
return annotate("tables")
start_monitoring()
tables = annotate("tables")
#page_counter.inc(npages)
return tables
@app.route("/redactions", methods=["POST"])
@metrics.summary('redactions_request_time_seconds', 'Time spent processing redaction request')
def get_redactions():
return annotate("redactions")
start_monitoring()
redactions = annotate("redactions")
#page_counter.inc(npages)
return redactions
@app.route("/figures", methods=["POST"])
@metrics.summary('figures_request_time_seconds', 'Time spent processing figures request')
def get_figures():
return annotate("figures")
start_monitoring()
figures = annotate("figures")
#page_counter.inc(npages)
return figures
@app.route("/layout", methods=["POST"])
@metrics.summary('layout_request_time_seconds', 'Time spent processing layout request')
def get_layout():
return annotate("layout")
start_monitoring()
layout = annotate("layout")
#page_counter.inc(npages)
return layout
@app.route("/status", methods=["GET"])
def status():
@ -59,6 +91,7 @@ def run_server():
elif mode == "production":
serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
logging.info("Production.")
tracemalloc.stop()
def apply_annotation_function(annotation_function, page_list):
@ -70,9 +103,7 @@ def apply_annotation_function(annotation_function, page_list):
return outdict
def make_annotations(pdf_data, task):
pdf = open_pdf(pdf_data)
def make_annotations(pdf, task):
if task == "tables":
annotation = {"tables": apply_annotation_function(parse_table, pdf)}
elif task == "redactions":
@ -88,13 +119,19 @@ def make_annotations(pdf_data, task):
return json.dumps(annotation, default=npconvert)
def get_size(data):
    """Return the size of ``data`` as reported by ``sys.getsizeof``, in units of 10**6 bytes.

    The result is rounded to two decimal places. ``getsizeof`` is shallow: it
    does not follow references into objects contained in ``data``.
    """
    size_in_bytes = getsizeof(data)
    size_in_megabytes = size_in_bytes / 1000000
    return round(size_in_megabytes, 2)
def annotate(task):
def inner():
data = request.data
logger.info(f"<3 Received data.")
logger.info(f"Processing data. <3")
annotations = make_annotations(data, task)
return jsonify({"result": annotations})
data = request.data
logger.info(f"<3 Received data.")
logger.info(f"Processing data. <3")
pdf, angles = open_pdf(data)
#npages = len(pdf)
annotations = make_annotations(pdf, task)
return jsonify({"result": annotations, "deskew_angles": angles})
try:
return inner()
except Exception as err:
@ -105,5 +142,24 @@ def annotate(task):
return resp
def make_art():
    """Return the multi-line ASCII-art banner string (starts and ends with a newline)."""
    return """
=================================================================================================
== ==== ============== ================= ==========================================
== ==== ============== ==== ================ ==== =========================================
== ==== ============== ==== ================ ==== =========================================
== ==== == == == ==== === ==== === ==== === === = ==== ==== === = ===
== == ========== == ==== == == = == === = == = == = == = == = ==
=== == === ===== === ==== == = == ===== =========== == ======== ==== == =======
=== == === ==== ==== ==== == = == ===== ========= == ========= === ===== =======
==== ==== === ===== ==== == = == = == ======== = == ======= = == = == =======
===== ===== == == ==== ==== === ========= == ======== ==== === =======
=================================================================================================
"""
if __name__ == "__main__":
main()

View File

@ -27,10 +27,13 @@ def find_redactions(image: np.array, min_normalized_area=200000):
contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
contours = map(
first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
)
return list(contours)
try:
contours = map(
first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
)
return list(contours)
except:
return []
def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):

View File

@ -1,5 +1,5 @@
import numpy as np
from scipy.ndimage import rotate
from scipy.ndimage import rotate as rotate_
import cv2
from vidocp.config import CONFIG
@ -14,7 +14,7 @@ def rotate_straight(im: np.array, skew_angle: int) -> np.array:
def find_score(arr, angle):
data = rotate(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
data = rotate_(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
hist = np.sum(data, axis=1)
score = np.sum((hist[1:] - hist[:-1]) ** 2)
return score
@ -36,12 +36,47 @@ def preprocess(arr: np.array):
return arr
def rotate(page, angle):
    """Rotate ``page`` by ``angle`` using ``scipy.ndimage.rotate``.

    The call fixes ``reshape=False``, ``order=0`` and ``mode="nearest"``;
    see the scipy documentation for what each option controls.

    :param page: image array to rotate
    :param angle: rotation angle, passed straight through to scipy
    :return: the rotated array
    """
    return rotate_(page, angle, reshape=False, order=0, mode="nearest")
def deskew_histbased(page: np.array):
page = preprocess(page)
best_angle = find_best_angle(page)
best_angle = round(find_best_angle(page), 3)
if CONFIG.deskew.verbose:
print("Skew angle from pixel histogram: {}".format(best_angle))
rotated = rotate(page, best_angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
return rotated, best_angle
rotated = rotate(page, best_angle)
return (rotated, best_angle)
def needs_deskew(page: np.array) -> bool:
    """
    Decide whether a page looks skewed, based on its 'row-wise mean difference'.

    For every row, the mean over the right half of the columns minus the mean over the left half
    is taken; the page's score is the mean absolute value of these differences. The same score is
    computed for the page rotated by -CONFIG.deskew.test_delta and by +CONFIG.deskew.test_delta.

    :param page: image array indexed as [row, column]; presumably a 2D grayscale page -- TODO confirm
    :return: True if at least one of the two test rotations scores strictly lower than the
        unrotated page, False otherwise.
    """

    def split_rowmean_diff(page):
        """Return, per row, the right-half mean minus the left-half mean."""
        cutpoint = page.shape[1] // 2
        left = page[:, :cutpoint]
        right = page[:, cutpoint:]
        return np.mean(right, axis=1) - np.mean(left, axis=1)

    def score(candidate):
        """Mean absolute left/right row-mean difference of ``candidate``."""
        return np.mean(np.abs(split_rowmean_diff(candidate)))

    unrotated_score = score(page)
    angles = [-CONFIG.deskew.test_delta, CONFIG.deskew.test_delta]
    scores = [score(rotate(page, angle)) for angle in angles]
    # Diagnostic output is gated on the verbose flag, like in deskew_histbased, so that
    # normal runs do not print one line per page.
    if CONFIG.deskew.verbose:
        print(unrotated_score, scores)
    return unrotated_score > min(scores)
# NOTE(review): printing the whole config at import time looks like leftover debug output --
# confirm before removing; kept here so import-time behavior is unchanged.
print(CONFIG)

# Select the module-level ``deskew`` callable from the configuration. Each variant maps a page
# to a ``(page, angle)`` pair.
if CONFIG.deskew.function == "hist":
    # Histogram-based deskewing, applied only when the skew check asks for it; otherwise the
    # page is passed through with an angle of 0.
    deskew = lambda page: deskew_histbased(page) if needs_deskew(page) else (page, 0)
elif CONFIG.deskew.function == "identity":
    # Pass-through: the page is returned untouched and the angle is reported as None.
    deskew = lambda page: (page, None)
else:
    # f-string so that the offending value is actually interpolated into the message.
    raise ValueError(f"'{CONFIG.deskew.function}' is not a valid parameter value for CONFIG.deskew.function")

View File

@ -2,12 +2,14 @@ from numpy import array
import pdf2image
import cv2
from vidocp.utils.deskew import deskew
def preprocess_pdf_image(page):
if len(page.shape) > 2:
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
page = cv2.fastNlMeansDenoising(page, h=3)
return page
return deskew(page)
def open_pdf(pdf, first_page=0, last_page=None):
@ -20,4 +22,4 @@ def open_pdf(pdf, first_page=0, last_page=None):
elif type(pdf) == list:
return pdf
pages = [preprocess_pdf_image(array(p)) for p in pages]
return pages
return list(zip(*pages))

0
yaml Normal file
View File