Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into fig-detection-scanned-pdfs
Conflicts: cv_analysis/figure_detection.py cv_analysis/layout_parsing.py cv_analysis/table_parsing.py scripts/annotate.py
This commit is contained in:
commit
0e30e97f80
@ -7,6 +7,7 @@ WORKDIR /app/service
|
||||
|
||||
COPY ./src ./src
|
||||
COPY cv_analysis ./cv_analysis
|
||||
COPY config.yaml ./config.yaml
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install -e .
|
||||
|
||||
@ -23,5 +23,5 @@ deskew:
|
||||
test_dummy: test_dummy
|
||||
|
||||
visual_logging:
|
||||
level: $LOGGING_LEVEL_ROOT|DEBUG
|
||||
level: $LOGGING_LEVEL_ROOT|INFO
|
||||
output_folder: /tmp/debug/
|
||||
@ -9,7 +9,7 @@ from cv_analysis.utils.post_processing import remove_included
|
||||
from cv_analysis.utils.filters import is_large_enough, has_acceptable_format
|
||||
from cv_analysis.utils.text import remove_primary_text_regions
|
||||
from cv_analysis.utils.visual_logging import vizlogger
|
||||
#from PIL import Image
|
||||
|
||||
|
||||
def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
|
||||
return is_large_enough(cont, min_area) and has_acceptable_format(cont, max_width_to_hight_ratio)
|
||||
|
||||
@ -51,4 +51,3 @@ def annotate_redactions_in_pdf(pdf_path, page_index=1, show=False):
|
||||
|
||||
if show:
|
||||
show_mpl(page)
|
||||
|
||||
@ -16,10 +16,8 @@ from cv_analysis.utils.visual_logging import vizlogger
|
||||
from cv_analysis.layout_parsing import parse_layout
|
||||
|
||||
|
||||
def add_external_contours(image, contour_source_image):
|
||||
contours, _ = cv2.findContours(contour_source_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
||||
contours = filter(partial(is_large_enough, min_area=5000), contours)
|
||||
|
||||
def add_external_contours(image, img):
|
||||
contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
||||
for cnt in contours:
|
||||
x, y, w, h = cv2.boundingRect(cnt)
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
|
||||
@ -80,7 +78,7 @@ def isolate_vertical_and_horizontal_components(img_bin):
|
||||
img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
|
||||
img_lines_raw = img_bin_v | img_bin_h
|
||||
vizlogger.debug(img_lines_raw, "tables02_isolate02_img_bin_v.png")
|
||||
|
||||
|
||||
kernel_h = np.ones((1, 30), np.uint8)
|
||||
kernel_v = np.ones((30, 1), np.uint8)
|
||||
img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2)
|
||||
@ -100,7 +98,7 @@ def isolate_vertical_and_horizontal_components(img_bin):
|
||||
vizlogger.debug(img_bin_final, "tables10_isolate12_threshold.png")
|
||||
img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
|
||||
vizlogger.debug(img_bin_final, "tables11_isolate13_dilate.png")
|
||||
|
||||
|
||||
# add contours before lines are extended by blurring
|
||||
img_bin_final = add_external_contours(img_bin_final, img_lines_raw)
|
||||
vizlogger.debug(img_bin_final, "tables11_isolate14_contours_added.png")
|
||||
@ -174,7 +172,7 @@ def parse_table(image: np.array, show=False):
|
||||
stats = np.vstack(list(filter(is_large_enough, stats)))
|
||||
rects = stats[:, :-1][2:]
|
||||
|
||||
return list(rects)
|
||||
return list(map(list, rects))
|
||||
|
||||
|
||||
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
|
||||
@ -189,6 +187,8 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
|
||||
stats = parse_table(page)
|
||||
page = draw_rectangles(page, stats, annotate=True)
|
||||
vizlogger.debug(page, "tables15_final_output.png")
|
||||
if show:
|
||||
show_mpl(page)
|
||||
|
||||
|
||||
def tables_in_image(cropped_image):
|
||||
|
||||
@ -1,406 +1,62 @@
|
||||
{
|
||||
"0": [
|
||||
[
|
||||
211,
|
||||
415,
|
||||
367,
|
||||
29
|
||||
],
|
||||
[
|
||||
581,
|
||||
415,
|
||||
417,
|
||||
29
|
||||
],
|
||||
[
|
||||
1001,
|
||||
415,
|
||||
406,
|
||||
29
|
||||
],
|
||||
[
|
||||
211,
|
||||
447,
|
||||
367,
|
||||
47
|
||||
],
|
||||
[
|
||||
581,
|
||||
447,
|
||||
417,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
447,
|
||||
406,
|
||||
47
|
||||
],
|
||||
[
|
||||
211,
|
||||
497,
|
||||
367,
|
||||
47
|
||||
],
|
||||
[
|
||||
580,
|
||||
497,
|
||||
418,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
497,
|
||||
406,
|
||||
47
|
||||
],
|
||||
[
|
||||
211,
|
||||
547,
|
||||
367,
|
||||
47
|
||||
],
|
||||
[
|
||||
580,
|
||||
547,
|
||||
418,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
547,
|
||||
406,
|
||||
47
|
||||
],
|
||||
[
|
||||
211,
|
||||
597,
|
||||
367,
|
||||
47
|
||||
],
|
||||
[
|
||||
581,
|
||||
597,
|
||||
417,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
597,
|
||||
406,
|
||||
48
|
||||
],
|
||||
[
|
||||
212,
|
||||
647,
|
||||
366,
|
||||
48
|
||||
],
|
||||
[
|
||||
581,
|
||||
647,
|
||||
417,
|
||||
48
|
||||
],
|
||||
[
|
||||
1001,
|
||||
647,
|
||||
406,
|
||||
48
|
||||
],
|
||||
[
|
||||
581,
|
||||
697,
|
||||
417,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
697,
|
||||
407,
|
||||
48
|
||||
],
|
||||
[
|
||||
212,
|
||||
698,
|
||||
366,
|
||||
47
|
||||
],
|
||||
[
|
||||
211,
|
||||
747,
|
||||
367,
|
||||
48
|
||||
],
|
||||
[
|
||||
581,
|
||||
747,
|
||||
417,
|
||||
48
|
||||
],
|
||||
[
|
||||
1001,
|
||||
748,
|
||||
407,
|
||||
47
|
||||
],
|
||||
[
|
||||
211,
|
||||
798,
|
||||
367,
|
||||
47
|
||||
],
|
||||
[
|
||||
581,
|
||||
798,
|
||||
417,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
798,
|
||||
407,
|
||||
47
|
||||
],
|
||||
[
|
||||
212,
|
||||
848,
|
||||
366,
|
||||
47
|
||||
],
|
||||
[
|
||||
581,
|
||||
848,
|
||||
417,
|
||||
47
|
||||
],
|
||||
[
|
||||
1001,
|
||||
848,
|
||||
407,
|
||||
48
|
||||
],
|
||||
[
|
||||
212,
|
||||
898,
|
||||
366,
|
||||
48
|
||||
],
|
||||
[
|
||||
581,
|
||||
898,
|
||||
417,
|
||||
48
|
||||
],
|
||||
[
|
||||
1001,
|
||||
898,
|
||||
407,
|
||||
48
|
||||
],
|
||||
[
|
||||
212,
|
||||
949,
|
||||
366,
|
||||
33
|
||||
],
|
||||
[
|
||||
581,
|
||||
949,
|
||||
827,
|
||||
33
|
||||
],
|
||||
[
|
||||
462,
|
||||
1163,
|
||||
368,
|
||||
29
|
||||
],
|
||||
[
|
||||
833,
|
||||
1163,
|
||||
404,
|
||||
29
|
||||
],
|
||||
[
|
||||
462,
|
||||
1195,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
833,
|
||||
1195,
|
||||
404,
|
||||
48
|
||||
],
|
||||
[
|
||||
462,
|
||||
1245,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
833,
|
||||
1245,
|
||||
404,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1296,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
833,
|
||||
1296,
|
||||
404,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1346,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
833,
|
||||
1346,
|
||||
404,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1396,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
834,
|
||||
1396,
|
||||
403,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1446,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
833,
|
||||
1446,
|
||||
404,
|
||||
48
|
||||
],
|
||||
[
|
||||
462,
|
||||
1496,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
833,
|
||||
1496,
|
||||
404,
|
||||
48
|
||||
],
|
||||
[
|
||||
462,
|
||||
1547,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
834,
|
||||
1547,
|
||||
403,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1597,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
834,
|
||||
1597,
|
||||
403,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1647,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
833,
|
||||
1647,
|
||||
404,
|
||||
48
|
||||
],
|
||||
[
|
||||
462,
|
||||
1698,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
833,
|
||||
1698,
|
||||
404,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1748,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
834,
|
||||
1748,
|
||||
403,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1798,
|
||||
368,
|
||||
47
|
||||
],
|
||||
[
|
||||
834,
|
||||
1798,
|
||||
403,
|
||||
47
|
||||
],
|
||||
[
|
||||
462,
|
||||
1848,
|
||||
368,
|
||||
48
|
||||
],
|
||||
[
|
||||
834,
|
||||
1848,
|
||||
403,
|
||||
48
|
||||
],
|
||||
[
|
||||
462,
|
||||
1899,
|
||||
369,
|
||||
33
|
||||
],
|
||||
[
|
||||
832,
|
||||
1899,
|
||||
405,
|
||||
33
|
||||
]
|
||||
[211, 447, 367, 47],
|
||||
[581, 447, 417, 47],
|
||||
[1001, 447, 406, 47],
|
||||
[211, 497, 367, 47],
|
||||
[580, 497, 418, 47],
|
||||
[1001, 497, 406, 47],
|
||||
[211, 547, 367, 47],
|
||||
[580, 547, 418, 47],
|
||||
[1001, 547, 406, 47],
|
||||
[211, 597, 367, 47],
|
||||
[581, 597, 417, 47],
|
||||
[1001, 597, 406, 48],
|
||||
[212, 647, 366, 48],
|
||||
[581, 647, 417, 48],
|
||||
[1001, 647, 406, 48],
|
||||
[581, 697, 417, 47],
|
||||
[1001, 697, 407, 48],
|
||||
[212, 698, 366, 47],
|
||||
[211, 747, 367, 48],
|
||||
[581, 747, 417, 48],
|
||||
[1001, 748, 407, 47],
|
||||
[211, 798, 367, 47],
|
||||
[581, 798, 417, 47],
|
||||
[1001, 798, 407, 47],
|
||||
[212, 848, 366, 47],
|
||||
[581, 848, 417, 47],
|
||||
[1001, 848, 407, 48],
|
||||
[212, 898, 366, 48],
|
||||
[581, 898, 417, 48],
|
||||
[1001, 898, 407, 48],
|
||||
[462, 1195, 368, 48],
|
||||
[833, 1195, 404, 48],
|
||||
[462, 1245, 368, 48],
|
||||
[833, 1245, 404, 47],
|
||||
[462, 1296, 368, 47],
|
||||
[833, 1296, 404, 47],
|
||||
[462, 1346, 368, 47],
|
||||
[833, 1346, 404, 47],
|
||||
[462, 1396, 368, 47],
|
||||
[834, 1396, 403, 47],
|
||||
[462, 1446, 368, 48],
|
||||
[833, 1446, 404, 48],
|
||||
[462, 1496, 368, 48],
|
||||
[833, 1496, 404, 48],
|
||||
[462, 1547, 368, 47],
|
||||
[834, 1547, 403, 47],
|
||||
[462, 1597, 368, 48],
|
||||
[834, 1597, 403, 47],
|
||||
[462, 1647, 368, 48],
|
||||
[833, 1647, 404, 48],
|
||||
[462, 1698, 368, 47],
|
||||
[833, 1698, 404, 47],
|
||||
[462, 1748, 368, 47],
|
||||
[834, 1748, 403, 47],
|
||||
[462, 1798, 368, 47],
|
||||
[834, 1798, 403, 47],
|
||||
[462, 1848, 368, 48],
|
||||
[834, 1848, 403, 48]
|
||||
]
|
||||
}
|
||||
@ -2,21 +2,21 @@ import os
|
||||
from cv_analysis.config import CONFIG
|
||||
from cv_analysis.utils.display import save_mpl
|
||||
|
||||
LEVEL = CONFIG.visual_logging.level
|
||||
OUTPUT_FOLDER = CONFIG.visual_logging.output_folder
|
||||
|
||||
|
||||
class VisualLogger:
|
||||
def __init__(self):
|
||||
self.level_is_debug = LEVEL == "DEBUG"
|
||||
self.output_folder = OUTPUT_FOLDER
|
||||
def __init__(self, level, output_folder):
|
||||
self.level = level
|
||||
self.output_folder = output_folder
|
||||
if not os.path.exists(self.output_folder):
|
||||
os.mkdir(self.output_folder)
|
||||
|
||||
def debug(self, img, name):
|
||||
if self.level_is_debug:
|
||||
if self.level_is_debug():
|
||||
output_path = os.path.join(self.output_folder, name)
|
||||
save_mpl(img, output_path)
|
||||
|
||||
def level_is_debug(self):
|
||||
return self.level == "DEBUG"
|
||||
|
||||
|
||||
vizlogger = VisualLogger()
|
||||
vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder)
|
||||
|
||||
@ -34,7 +34,7 @@ def parse_args():
|
||||
|
||||
|
||||
def main(args):
|
||||
|
||||
|
||||
operations = args.operations.split(",")
|
||||
for operation in operations:
|
||||
print("****************************")
|
||||
@ -49,7 +49,7 @@ def main(args):
|
||||
elif operation == "layout-parsing":
|
||||
response = requests.post("http://127.0.0.1:5000/layout", data=open(args.pdf_path, "rb"))
|
||||
else:
|
||||
raise ValueError("{args.operation} is not a valid value.")
|
||||
raise ValueError(f"{args.operation} is not a valid value.")
|
||||
response.raise_for_status()
|
||||
predictions = response.json()
|
||||
|
||||
|
||||
@ -8,11 +8,10 @@ from prometheus_flask_exporter import PrometheusMetrics
|
||||
from waitress import serve
|
||||
|
||||
from cv_analysis.utils import npconvert
|
||||
from cv_analysis.utils.preprocessing import preprocess_pdf_image # TODO
|
||||
from cv_analysis.table_parsing import parse_table # , detect_tables_in_pdf
|
||||
from cv_analysis.redaction_detection import find_redactions # , detect_redactions_in_pdf
|
||||
from cv_analysis.layout_parsing import parse_layout # , detect_layout_in_pdf #TODO
|
||||
from cv_analysis.figure_detection import detect_figures # , detect_figures_in_pdf #TODO
|
||||
from cv_analysis.table_parsing import parse_table
|
||||
from cv_analysis.redaction_detection import find_redactions
|
||||
from cv_analysis.layout_parsing import parse_layout
|
||||
from cv_analysis.figure_detection import detect_figures
|
||||
from cv_analysis.utils.logging import logger
|
||||
from cv_analysis.utils.preprocessing import open_pdf
|
||||
from cv_analysis.config import CONFIG
|
||||
@ -26,7 +25,6 @@ def suppress_user_warnings():
|
||||
|
||||
def main():
|
||||
file_counter = Counter("cv_analysis_file_counter", "count processed files")
|
||||
# page_counter = Counter("cv_analysis_page_counter", "count pages from processed files")
|
||||
ram_metric = Gauge("cv_analysis_memory_usage", "Memory usage in Mb")
|
||||
|
||||
def start_monitoring():
|
||||
@ -44,32 +42,28 @@ def main():
|
||||
@metrics.summary("tables_request_time_seconds", "Time spent processing tables request")
|
||||
def get_tables():
|
||||
start_monitoring()
|
||||
tables = annotate("tables")
|
||||
# page_counter.inc(npages)
|
||||
tables = annotate(parse_table)
|
||||
return tables
|
||||
|
||||
@app.route("/redactions", methods=["POST"])
|
||||
@metrics.summary("redactions_request_time_seconds", "Time spent processing redaction request")
|
||||
def get_redactions():
|
||||
start_monitoring()
|
||||
redactions = annotate("redactions")
|
||||
# page_counter.inc(npages)
|
||||
redactions = annotate(find_redactions)
|
||||
return redactions
|
||||
|
||||
@app.route("/figures", methods=["POST"])
|
||||
@metrics.summary("figures_request_time_seconds", "Time spent processing figures request")
|
||||
def get_figures():
|
||||
start_monitoring()
|
||||
figures = annotate("figures")
|
||||
# page_counter.inc(npages)
|
||||
figures = annotate(detect_figures)
|
||||
return figures
|
||||
|
||||
@app.route("/layout", methods=["POST"])
|
||||
@metrics.summary("layout_request_time_seconds", "Time spent processing layout request")
|
||||
def get_layout():
|
||||
start_monitoring()
|
||||
layout = annotate("layout")
|
||||
# page_counter.inc(npages)
|
||||
layout = annotate(parse_layout)
|
||||
return layout
|
||||
|
||||
@app.route("/status", methods=["GET"])
|
||||
@ -77,7 +71,6 @@ def main():
|
||||
response = "OK"
|
||||
return jsonify(response)
|
||||
|
||||
# predictor = initialize_predictor()
|
||||
logger.info("<3 Annotator ready.")
|
||||
|
||||
mode = CONFIG.webserver.mode
|
||||
@ -89,46 +82,30 @@ def main():
|
||||
tracemalloc.stop()
|
||||
|
||||
|
||||
def apply_annotation_function(annotation_function, page_list):
|
||||
outdict = {}
|
||||
for i, page in enumerate(page_list):
|
||||
results = annotation_function(page)
|
||||
if results:
|
||||
outdict.update({i: results})
|
||||
return outdict
|
||||
|
||||
|
||||
def make_annotations(pdf, task):
|
||||
if task == "tables":
|
||||
annotation = {"tables": apply_annotation_function(parse_table, pdf)}
|
||||
elif task == "redactions":
|
||||
annotation = {"redactions": apply_annotation_function(find_redactions, pdf)}
|
||||
elif task == "figures":
|
||||
annotation = {"figures": apply_annotation_function(detect_figures, pdf)}
|
||||
elif task == "layout":
|
||||
annotation = {"layout": apply_annotation_function(parse_layout, pdf)}
|
||||
else:
|
||||
raise ValueError(
|
||||
f"'{task}' is not a valid operation keyword. Valid values include: \
|
||||
\ntables\nredactions\nfigures\nlayout\n"
|
||||
)
|
||||
|
||||
return json.dumps(annotation, default=npconvert)
|
||||
def make_annotations(pdf, annotation_function):
|
||||
results = []
|
||||
for i, page in enumerate(pdf):
|
||||
boxes = annotation_function(page)
|
||||
cells = [{"x": x, "y": y, "width": w, "height": h} for x, y, w, h in boxes]
|
||||
results.append({"page": i, "pageWidth": page.shape[1], "pageHeight": page.shape[0], "cells": cells})
|
||||
output_dict = {"pages": results}
|
||||
return jsonify(json.dumps(output_dict, default=npconvert))
|
||||
|
||||
|
||||
def get_size(data):
|
||||
return round(getsizeof(data) / 1000000, 2)
|
||||
|
||||
|
||||
def annotate(task):
|
||||
def annotate(annotation_function):
|
||||
def inner():
|
||||
data = request.data
|
||||
logger.info(f"Received data.")
|
||||
logger.info(f"Processing data.")
|
||||
pdf, angles = open_pdf(data)
|
||||
# npages = len(pdf)
|
||||
annotations = make_annotations(pdf, task)
|
||||
return jsonify({"result": annotations, "deskew_angles": angles})
|
||||
annotations = make_annotations(pdf, annotation_function)
|
||||
# if CONFIG.deskew.function != "identity":
|
||||
# annotations.update({"deskew_angles": angles})
|
||||
return annotations
|
||||
|
||||
try:
|
||||
return inner()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user