post-monitoring debug, especially of deskewing and skew check
This commit is contained in:
parent
fa479adfb0
commit
635fb84811
35
README.md
35
README.md
@ -25,7 +25,7 @@ dvc pull
|
|||||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||||
the specific task.
|
the specific task.
|
||||||
|
|
||||||
#### Redaction Detection
|
#### Redaction Detection (API)
|
||||||
|
|
||||||
The below snippet shows how to find the outlines of previous redactions.
|
The below snippet shows how to find the outlines of previous redactions.
|
||||||
|
|
||||||
@ -44,76 +44,75 @@ page = np.array(page)
|
|||||||
redaction_contours = find_redactions(page)
|
redaction_contours = find_redactions(page)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## As a CLI Tool
|
||||||
### As a CLI Tool
|
|
||||||
|
|
||||||
|
|
||||||
Core API functionalities can be used through a CLI.
|
Core API functionalities can be used through a CLI.
|
||||||
|
|
||||||
|
### Table Parsing
|
||||||
#### Table Parsing
|
|
||||||
|
|
||||||
The table parsing utility detects and segments tables into individual cells.
|
The table parsing utility detects and segments tables into individual cells.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/annotate.py data/test_pdf.pdf 7 --type table
|
python scripts/annotate.py data/test_pdf.pdf 7 --type table
|
||||||
```
|
```
|
||||||
|
|
||||||
The below image shows a parsed table, where each table cell has been detected individually.
|
The below image shows a parsed table, where each table cell has been detected individually.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
### Redaction Detection (CLI)
|
||||||
#### Redaction Detection
|
|
||||||
|
|
||||||
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
|
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
|
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
|
||||||
```
|
```
|
||||||
|
|
||||||
The below image shows the detected redactions with green outlines.
|
The below image shows the detected redactions with green outlines.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
### Layout Parsing
|
||||||
#### Layout Parsing
|
|
||||||
|
|
||||||
The layout parsing utility detects elements such as paragraphs, tables and figures.
|
The layout parsing utility detects elements such as paragraphs, tables and figures.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
|
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
|
||||||
```
|
```
|
||||||
|
|
||||||
The below image shows the detected layout elements on a page.
|
The below image shows the detected layout elements on a page.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
### Figure Detection
|
||||||
#### Figure Detection
|
|
||||||
|
|
||||||
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
|
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
|
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
|
||||||
```
|
```
|
||||||
|
|
||||||
The below image shows the detected figure on a page.
|
The below image shows the detected figure on a page.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
|
||||||
## Running as a service
|
## Running as a service
|
||||||
|
|
||||||
### Building
|
### Building
|
||||||
|
|
||||||
Build base image
|
Build base image
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
bash setup/docker.sh
|
bash setup/docker.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Build head image
|
Build head image
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker build -f Dockerfile -t vidocp . --build-arg BASE_ROOT=""
|
docker build -f Dockerfile -t vidocp . --build-arg BASE_ROOT=""
|
||||||
```
|
```
|
||||||
|
|
||||||
### Usage
|
### Usage (service)
|
||||||
|
|
||||||
Shell 1
|
Shell 1
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@ device: cpu
|
|||||||
service:
|
service:
|
||||||
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for log file messages
|
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for log file messages
|
||||||
logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log)
|
logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log)
|
||||||
|
monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not
|
||||||
|
|
||||||
webserver:
|
webserver:
|
||||||
host: $SERVER_HOST|"127.0.0.1" # webserver address
|
host: $SERVER_HOST|"127.0.0.1" # webserver address
|
||||||
@ -9,9 +10,11 @@ webserver:
|
|||||||
mode: $SERVER_MODE|production # webserver mode: {development, production}
|
mode: $SERVER_MODE|production # webserver mode: {development, production}
|
||||||
|
|
||||||
deskew:
|
deskew:
|
||||||
|
function: identity # function to use: {hist: deskew_histbased, identity: <identity mapping>}
|
||||||
preprocess: True
|
preprocess: True
|
||||||
max_abs_angle: 1.5
|
max_abs_angle: 1.5
|
||||||
delta: 0.15
|
delta: 0.1
|
||||||
|
test_delta: 0.15
|
||||||
mode: nearest
|
mode: nearest
|
||||||
verbose: False
|
verbose: False
|
||||||
filter_strength_h: 3
|
filter_strength_h: 3
|
||||||
@ -14,3 +14,5 @@ pytest~=6.2
|
|||||||
envyaml~=1.8
|
envyaml~=1.8
|
||||||
coverage~=5.5
|
coverage~=5.5
|
||||||
dependency-check~=0.6.0
|
dependency-check~=0.6.0
|
||||||
|
prometheus-client~=0.13.1
|
||||||
|
prometheus_flask_exporter~=0.19.0
|
||||||
@ -1,6 +1,7 @@
|
|||||||
# python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing
|
# python client_mock.py --pdf_path=/home/iriley/Documents/pdfs/unscanned/06.pdf --operations=table-parsing
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
|
from multiprocessing.sharedctypes import Value
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from vidocp.utils.preprocessing import open_pdf
|
from vidocp.utils.preprocessing import open_pdf
|
||||||
@ -40,7 +41,21 @@ def main(args):
|
|||||||
# {"operations": args.operations.split(",")}
|
# {"operations": args.operations.split(",")}
|
||||||
# )
|
# )
|
||||||
# }
|
# }
|
||||||
|
operations = args.operations.split(",")
|
||||||
|
for operation in operations:
|
||||||
|
print("****************************")
|
||||||
|
print(f"{' '+operation+' ':^27}")
|
||||||
|
print("****************************")
|
||||||
|
if operation == "table-parsing":
|
||||||
response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
|
response = requests.post("http://127.0.0.1:5000/tables", data=open(args.pdf_path, "rb"))
|
||||||
|
elif operation == "redaction-detection":
|
||||||
|
response = requests.post("http://127.0.0.1:5000/redactions", data=open(args.pdf_path, "rb"))
|
||||||
|
elif operation == "figure-detection":
|
||||||
|
response = requests.post("http://127.0.0.1:5000/figures", data=open(args.pdf_path, "rb"))
|
||||||
|
elif operation == "layout-parsing":
|
||||||
|
response = requests.post("http://127.0.0.1:5000/layout", data=open(args.pdf_path, "rb"))
|
||||||
|
else:
|
||||||
|
raise ValueError("{args.operation} is not a valid value.")
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
predictions = response.json()
|
predictions = response.json()
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,11 @@
|
|||||||
import argparse
|
|
||||||
import json
|
import json
|
||||||
|
import tracemalloc
|
||||||
|
from sys import getsizeof
|
||||||
import logging
|
import logging
|
||||||
from typing import List
|
from typing import List
|
||||||
from flask import Flask, request, jsonify
|
from flask import Flask, request, jsonify
|
||||||
|
from prometheus_client import Counter, Gauge
|
||||||
|
from prometheus_flask_exporter import PrometheusMetrics
|
||||||
from waitress import serve
|
from waitress import serve
|
||||||
|
|
||||||
from vidocp.utils import npconvert
|
from vidocp.utils import npconvert
|
||||||
@ -27,23 +30,52 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
def run_server():
|
def run_server():
|
||||||
|
file_counter = Counter("vidocp_file_counter", "count processed files")
|
||||||
|
#page_counter = Counter("vidocp_page_counter", "count pages from processed files")
|
||||||
|
ram_metric = Gauge("vidocp_memory_usage", "Memory usage in Mb")
|
||||||
|
|
||||||
|
def start_monitoring():
|
||||||
|
file_counter.inc()
|
||||||
|
_, peak = tracemalloc.get_traced_memory()
|
||||||
|
ram_metric.set(peak / 10 ** 6)
|
||||||
|
|
||||||
|
logger.info(make_art())
|
||||||
|
tracemalloc.start()
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
metrics = PrometheusMetrics(app=app, path='/prometheus')
|
||||||
|
|
||||||
@app.route("/tables", methods=["POST"])
|
@app.route("/tables", methods=["POST"])
|
||||||
|
@metrics.summary('tables_request_time_seconds', 'Time spent processing tables request')
|
||||||
def get_tables():
|
def get_tables():
|
||||||
return annotate("tables")
|
start_monitoring()
|
||||||
|
tables = annotate("tables")
|
||||||
|
#page_counter.inc(npages)
|
||||||
|
return tables
|
||||||
|
|
||||||
@app.route("/redactions", methods=["POST"])
|
@app.route("/redactions", methods=["POST"])
|
||||||
|
@metrics.summary('redactions_request_time_seconds', 'Time spent processing redaction request')
|
||||||
def get_redactions():
|
def get_redactions():
|
||||||
return annotate("redactions")
|
start_monitoring()
|
||||||
|
redactions = annotate("redactions")
|
||||||
|
#page_counter.inc(npages)
|
||||||
|
return redactions
|
||||||
|
|
||||||
@app.route("/figures", methods=["POST"])
|
@app.route("/figures", methods=["POST"])
|
||||||
|
@metrics.summary('figures_request_time_seconds', 'Time spent processing figures request')
|
||||||
def get_figures():
|
def get_figures():
|
||||||
return annotate("figures")
|
start_monitoring()
|
||||||
|
figures = annotate("figures")
|
||||||
|
#page_counter.inc(npages)
|
||||||
|
return figures
|
||||||
|
|
||||||
@app.route("/layout", methods=["POST"])
|
@app.route("/layout", methods=["POST"])
|
||||||
|
@metrics.summary('layout_request_time_seconds', 'Time spent processing layout request')
|
||||||
def get_layout():
|
def get_layout():
|
||||||
return annotate("layout")
|
start_monitoring()
|
||||||
|
layout = annotate("layout")
|
||||||
|
#page_counter.inc(npages)
|
||||||
|
return layout
|
||||||
|
|
||||||
@app.route("/status", methods=["GET"])
|
@app.route("/status", methods=["GET"])
|
||||||
def status():
|
def status():
|
||||||
@ -59,6 +91,7 @@ def run_server():
|
|||||||
elif mode == "production":
|
elif mode == "production":
|
||||||
serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
|
serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
|
||||||
logging.info("Production.")
|
logging.info("Production.")
|
||||||
|
tracemalloc.stop()
|
||||||
|
|
||||||
|
|
||||||
def apply_annotation_function(annotation_function, page_list):
|
def apply_annotation_function(annotation_function, page_list):
|
||||||
@ -70,9 +103,7 @@ def apply_annotation_function(annotation_function, page_list):
|
|||||||
return outdict
|
return outdict
|
||||||
|
|
||||||
|
|
||||||
def make_annotations(pdf_data, task):
|
def make_annotations(pdf, task):
|
||||||
pdf = open_pdf(pdf_data)
|
|
||||||
|
|
||||||
if task == "tables":
|
if task == "tables":
|
||||||
annotation = {"tables": apply_annotation_function(parse_table, pdf)}
|
annotation = {"tables": apply_annotation_function(parse_table, pdf)}
|
||||||
elif task == "redactions":
|
elif task == "redactions":
|
||||||
@ -88,13 +119,19 @@ def make_annotations(pdf_data, task):
|
|||||||
return json.dumps(annotation, default=npconvert)
|
return json.dumps(annotation, default=npconvert)
|
||||||
|
|
||||||
|
|
||||||
|
def get_size(data):
    """Return the shallow size of ``data`` in units of 1,000,000 bytes.

    The size is taken from ``sys.getsizeof`` (which does not follow
    references into contained objects) and rounded to two decimals.
    """
    size_in_bytes = getsizeof(data)
    megabytes = size_in_bytes / 1000000
    return round(megabytes, 2)
|
||||||
|
|
||||||
|
|
||||||
def annotate(task):
|
def annotate(task):
|
||||||
def inner():
|
def inner():
|
||||||
data = request.data
|
data = request.data
|
||||||
logger.info(f"<3 Received data.")
|
logger.info(f"<3 Received data.")
|
||||||
logger.info(f"Processing data. <3")
|
logger.info(f"Processing data. <3")
|
||||||
annotations = make_annotations(data, task)
|
pdf, angles = open_pdf(data)
|
||||||
return jsonify({"result": annotations})
|
#npages = len(pdf)
|
||||||
|
annotations = make_annotations(pdf, task)
|
||||||
|
return jsonify({"result": annotations, "deskew_angles": angles})
|
||||||
try:
|
try:
|
||||||
return inner()
|
return inner()
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
@ -105,5 +142,24 @@ def annotate(task):
|
|||||||
return resp
|
return resp
|
||||||
|
|
||||||
|
|
||||||
|
def make_art():
|
||||||
|
art = """
|
||||||
|
|
||||||
|
=================================================================================================
|
||||||
|
== ==== ============== ================= ==========================================
|
||||||
|
== ==== ============== ==== ================ ==== =========================================
|
||||||
|
== ==== ============== ==== ================ ==== =========================================
|
||||||
|
== ==== == == == ==== === ==== === ==== === === = ==== ==== === = ===
|
||||||
|
== == ========== == ==== == == = == === = == = == = == = == = ==
|
||||||
|
=== == === ===== === ==== == = == ===== =========== == ======== ==== == =======
|
||||||
|
=== == === ==== ==== ==== == = == ===== ========= == ========= === ===== =======
|
||||||
|
==== ==== === ===== ==== == = == = == ======== = == ======= = == = == =======
|
||||||
|
===== ===== == == ==== ==== === ========= == ======== ==== === =======
|
||||||
|
=================================================================================================
|
||||||
|
|
||||||
|
"""
|
||||||
|
return art
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@ -27,10 +27,13 @@ def find_redactions(image: np.array, min_normalized_area=200000):
|
|||||||
|
|
||||||
contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
|
contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
|
||||||
|
|
||||||
|
try:
|
||||||
contours = map(
|
contours = map(
|
||||||
first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
|
first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0]))
|
||||||
)
|
)
|
||||||
return list(contours)
|
return list(contours)
|
||||||
|
except:
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):
|
def annotate_redactions_in_pdf(pdf_path, page_index=1, show=True):
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.ndimage import rotate
|
from scipy.ndimage import rotate as rotate_
|
||||||
import cv2
|
import cv2
|
||||||
|
|
||||||
from vidocp.config import CONFIG
|
from vidocp.config import CONFIG
|
||||||
@ -14,7 +14,7 @@ def rotate_straight(im: np.array, skew_angle: int) -> np.array:
|
|||||||
|
|
||||||
|
|
||||||
def find_score(arr, angle):
|
def find_score(arr, angle):
|
||||||
data = rotate(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
|
data = rotate_(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
|
||||||
hist = np.sum(data, axis=1)
|
hist = np.sum(data, axis=1)
|
||||||
score = np.sum((hist[1:] - hist[:-1]) ** 2)
|
score = np.sum((hist[1:] - hist[:-1]) ** 2)
|
||||||
return score
|
return score
|
||||||
@ -36,12 +36,47 @@ def preprocess(arr: np.array):
|
|||||||
return arr
|
return arr
|
||||||
|
|
||||||
|
|
||||||
|
def rotate(page, angle, mode="nearest", order=0):
    """Rotate ``page`` by ``angle`` degrees while keeping the input shape.

    Thin wrapper around ``scipy.ndimage.rotate`` (imported as ``rotate_``)
    that always passes ``reshape=False`` so the output has the same shape
    as the input.

    Args:
        page: Image array to rotate.
        angle: Rotation angle in degrees.
        mode: Boundary handling passed through to ``scipy.ndimage.rotate``.
            Defaults to "nearest", the value that was previously hard-coded;
            callers may pass e.g. ``CONFIG.deskew.mode`` to stay in sync
            with the scoring step.
        order: Spline interpolation order passed through to
            ``scipy.ndimage.rotate``. Defaults to 0, as before.

    Returns:
        The rotated array.
    """
    return rotate_(page, angle, reshape=False, order=order, mode=mode)
|
||||||
|
|
||||||
|
|
||||||
def deskew_histbased(page: np.array):
|
def deskew_histbased(page: np.array):
|
||||||
page = preprocess(page)
|
page = preprocess(page)
|
||||||
best_angle = find_best_angle(page)
|
best_angle = round(find_best_angle(page), 3)
|
||||||
|
|
||||||
if CONFIG.deskew.verbose:
|
if CONFIG.deskew.verbose:
|
||||||
print("Skew angle from pixel histogram: {}".format(best_angle))
|
print("Skew angle from pixel histogram: {}".format(best_angle))
|
||||||
|
|
||||||
rotated = rotate(page, best_angle, reshape=False, order=0, mode=CONFIG.deskew.mode)
|
rotated = rotate(page, best_angle)
|
||||||
return rotated, best_angle
|
return (rotated, best_angle)
|
||||||
|
|
||||||
|
|
||||||
|
def needs_deskew(page: np.array) -> bool:
    """Decide whether ``page`` scores better after a small trial rotation.

    Makes use of the 'row-wise mean difference': for every row, the mean of
    the right half of the page minus the mean of the left half. The score of
    a page is the mean absolute value of these differences.

    The score of the unrotated page is compared with the scores of the page
    rotated by ``-CONFIG.deskew.test_delta`` and ``+CONFIG.deskew.test_delta``.

    Args:
        page: 2-D image array (indexed as rows, columns).

    Returns:
        True if at least one trial rotation yields a lower score than the
        unrotated page, False otherwise.
    """

    def split_rowmean_diff(image):
        """Return per-row mean of the right half minus that of the left half."""
        cutpoint = image.shape[1] // 2
        left_means = np.mean(image[:, :cutpoint], axis=1)
        right_means = np.mean(image[:, cutpoint:], axis=1)
        return right_means - left_means

    def score(image):
        """Return the mean absolute row-wise left/right difference."""
        return np.mean(np.abs(split_rowmean_diff(image)))

    unrotated_score = score(page)
    angles = [-CONFIG.deskew.test_delta, CONFIG.deskew.test_delta]
    scores = [score(rotate(page, angle)) for angle in angles]

    # Diagnostic output is gated on the verbose flag, matching deskew_histbased,
    # so that regular runs do not write to stdout for every page.
    if CONFIG.deskew.verbose:
        print(unrotated_score, scores)

    return unrotated_score > min(scores)
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): this dumps the whole configuration to stdout at import time;
# it looks like leftover debug output -- confirm before removing.
print(CONFIG)

# Select the deskew strategy once, at import time, from CONFIG.deskew.function.
# Every strategy maps a page to a (page, angle) tuple.
if CONFIG.deskew.function == "hist":

    def deskew(page):
        """Deskew via deskew_histbased if needs_deskew(page) holds, else return (page, 0)."""
        return deskew_histbased(page) if needs_deskew(page) else (page, 0)

elif CONFIG.deskew.function == "identity":

    def deskew(page):
        """Identity mapping: return the page unchanged together with angle None."""
        return (page, None)

else:
    # The f-prefix was missing, so the message showed the literal placeholder
    # instead of the offending configuration value.
    raise ValueError(f"'{CONFIG.deskew.function}' is not a valid parameter value for CONFIG.deskew.function")
|
||||||
@ -2,12 +2,14 @@ from numpy import array
|
|||||||
import pdf2image
|
import pdf2image
|
||||||
import cv2
|
import cv2
|
||||||
|
|
||||||
|
from vidocp.utils.deskew import deskew
|
||||||
|
|
||||||
|
|
||||||
def preprocess_pdf_image(page):
|
def preprocess_pdf_image(page):
|
||||||
if len(page.shape) > 2:
|
if len(page.shape) > 2:
|
||||||
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
|
page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
|
||||||
page = cv2.fastNlMeansDenoising(page, h=3)
|
page = cv2.fastNlMeansDenoising(page, h=3)
|
||||||
return page
|
return deskew(page)
|
||||||
|
|
||||||
|
|
||||||
def open_pdf(pdf, first_page=0, last_page=None):
|
def open_pdf(pdf, first_page=0, last_page=None):
|
||||||
@ -20,4 +22,4 @@ def open_pdf(pdf, first_page=0, last_page=None):
|
|||||||
elif type(pdf) == list:
|
elif type(pdf) == list:
|
||||||
return pdf
|
return pdf
|
||||||
pages = [preprocess_pdf_image(array(p)) for p in pages]
|
pages = [preprocess_pdf_image(array(p)) for p in pages]
|
||||||
return pages
|
return list(zip(*pages))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user