From 9327fb7231a7097b2e57d2cefd82fe4c8b1ffeeb Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Fri, 22 Apr 2022 11:22:16 +0200 Subject: [PATCH 1/5] fixed json format and refactored service functions --- Dockerfile | 1 + config.yaml | 2 +- scripts/client_mock.py | 2 +- src/run_service.py | 68 +++++++++++++++++++----------------------- 4 files changed, 33 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index 054c5d9..19f3b04 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,7 @@ WORKDIR /app/service COPY ./src ./src COPY cv_analysis ./cv_analysis +COPY config.yaml ./config.yaml RUN python3 -m pip install --upgrade pip RUN python3 -m pip install -e . diff --git a/config.yaml b/config.yaml index fc6bb42..42bd2e7 100644 --- a/config.yaml +++ b/config.yaml @@ -23,5 +23,5 @@ deskew: test_dummy: test_dummy visual_logging: - level: $LOGGING_LEVEL_ROOT|DEBUG + level: $LOGGING_LEVEL_ROOT|INFO output_folder: /tmp/debug/ \ No newline at end of file diff --git a/scripts/client_mock.py b/scripts/client_mock.py index ffdd0ab..96ab9b4 100644 --- a/scripts/client_mock.py +++ b/scripts/client_mock.py @@ -49,7 +49,7 @@ def main(args): elif operation == "layout-parsing": response = requests.post("http://127.0.0.1:5000/layout", data=open(args.pdf_path, "rb")) else: - raise ValueError("{args.operation} is not a valid value.") + raise ValueError(f"{args.operation} is not a valid value.") response.raise_for_status() predictions = response.json() diff --git a/src/run_service.py b/src/run_service.py index 269c2f4..304e8d8 100644 --- a/src/run_service.py +++ b/src/run_service.py @@ -8,11 +8,10 @@ from prometheus_flask_exporter import PrometheusMetrics from waitress import serve from cv_analysis.utils import npconvert -from cv_analysis.utils.preprocessing import preprocess_pdf_image # TODO -from cv_analysis.table_parsing import parse_table # , detect_tables_in_pdf -from cv_analysis.redaction_detection import find_redactions # , detect_redactions_in_pdf -from cv_analysis.layout_parsing import parse_layout # , detect_layout_in_pdf #TODO -from cv_analysis.figure_detection import detect_figures # , detect_figures_in_pdf #TODO +from cv_analysis.table_parsing import parse_table +from cv_analysis.redaction_detection import find_redactions +from cv_analysis.layout_parsing import parse_layout +from cv_analysis.figure_detection import detect_figures from cv_analysis.utils.logging import logger from cv_analysis.utils.preprocessing import open_pdf from cv_analysis.config import CONFIG @@ -44,7 +43,7 @@ def main(): @metrics.summary("tables_request_time_seconds", "Time spent processing tables request") def get_tables(): start_monitoring() - tables = annotate("tables") + tables = annotate(parse_table) # page_counter.inc(npages) return tables @@ -52,7 +51,7 @@ def main(): @metrics.summary("redactions_request_time_seconds", "Time spent processing redaction request") def get_redactions(): start_monitoring() - redactions = annotate("redactions") + redactions = annotate(find_redactions) # page_counter.inc(npages) return redactions @@ -60,7 +59,7 @@ def main(): @metrics.summary("figures_request_time_seconds", "Time spent processing figures request") def get_figures(): start_monitoring() - figures = annotate("figures") + figures = annotate(detect_figures) # page_counter.inc(npages) return figures @@ -68,7 +67,7 @@ def main(): @metrics.summary("layout_request_time_seconds", "Time spent processing layout request") def get_layout(): start_monitoring() - layout = annotate("layout") + layout = annotate(parse_layout) # page_counter.inc(npages) return layout @@ -77,7 +76,6 @@ def main(): response = "OK" return jsonify(response) - # predictor = initialize_predictor() logger.info("<3 Annotator ready.") mode = CONFIG.webserver.mode @@ -89,46 +87,40 @@ def main(): tracemalloc.stop() -def apply_annotation_function(annotation_function, page_list): - outdict = {} - for i, page in enumerate(page_list): - results = annotation_function(page) - if results: - outdict.update({i: results}) - return outdict - -def make_annotations(pdf, task): - if task == "tables": - annotation = {"tables": apply_annotation_function(parse_table, pdf)} - elif task == "redactions": - annotation = {"redactions": apply_annotation_function(find_redactions, pdf)} - elif task == "figures": - annotation = {"figures": apply_annotation_function(detect_figures, pdf)} - elif task == "layout": - annotation = {"layout": apply_annotation_function(parse_layout, pdf)} - else: - raise ValueError( - f"'{task}' is not a valid operation keyword. Valid values include: \ - \ntables\nredactions\nfigures\nlayout\n" - ) - - return json.dumps(annotation, default=npconvert) +def make_annotations(pdf, annotation_function): + results = [] + for i, page in enumerate(pdf): + boxes = annotation_function(page) + cells= [] + if boxes: + cells = [{"x": x, "y": y, "width": w, "height": h} for x,y,w,h in boxes] + results.append({ + "page": i, + "pageWidth": page.shape[1], + "pageHeight": page.shape[0], + "cells": cells + }) + logger.info(str(results)) + logger.info(type(results)) + output_dict = {"pages": results} + return jsonify(json.dumps(output_dict, default=npconvert)) def get_size(data): return round(getsizeof(data) / 1000000, 2) -def annotate(task): +def annotate(annotation_function): def inner(): data = request.data logger.info(f"Received data.") logger.info(f"Processing data.") pdf, angles = open_pdf(data) - # npages = len(pdf) - annotations = make_annotations(pdf, task) - return jsonify({"result": annotations, "deskew_angles": angles}) + annotations = make_annotations(pdf, annotation_function) + #if CONFIG.deskew.function != "identity": + # annotations.update({"deskew_angles": angles}) + return annotations try: return inner() From 4ac1cce0e89f8baa84fafca3cc8c959ba3b8591a Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Tue, 26 Apr 2022 16:01:57 +0200 Subject: [PATCH 2/5] reformatting --- cv_analysis/figure_detection.py | 1 - cv_analysis/layout_parsing.py | 2 +- cv_analysis/redaction_detection.py | 1 - cv_analysis/table_parsing.py | 18 +++++++++--------- cv_analysis/utils/visual_logging.py | 16 ++++++++-------- scripts/annotate.py | 2 +- scripts/client_mock.py | 2 +- src/run_service.py | 14 +++----------- 8 files changed, 23 insertions(+), 33 deletions(-) diff --git a/cv_analysis/figure_detection.py b/cv_analysis/figure_detection.py index 4d50233..38f3f48 100644 --- a/cv_analysis/figure_detection.py +++ b/cv_analysis/figure_detection.py @@ -41,4 +41,3 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False): vizlogger.debug(page, "figures03_final.png") if show: show_mpl(page) - \ No newline at end of file diff --git a/cv_analysis/layout_parsing.py b/cv_analysis/layout_parsing.py index 2d6dcd3..290a3d0 100644 --- a/cv_analysis/layout_parsing.py +++ b/cv_analysis/layout_parsing.py @@ -86,7 +86,7 @@ def annotate_layout_in_pdf(pdf_path, page_index=1, show=False): if show: show_mpl(page) - + """ def find_layout_boxes(image: np.array): diff --git a/cv_analysis/redaction_detection.py b/cv_analysis/redaction_detection.py index f4fe7ca..e81ef53 100644 --- a/cv_analysis/redaction_detection.py +++ b/cv_analysis/redaction_detection.py @@ -51,4 +51,3 @@ def annotate_redactions_in_pdf(pdf_path, page_index=1, show=False): if show: show_mpl(page) - \ No newline at end of file diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index a6a6afd..11e64d2 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -26,12 +26,12 @@ def add_external_contours(image, img): def apply_motion_blur(image: np.array, angle, size=80): """Solidifies and slightly extends detected lines. - + Args: image (np.array): page image as array angle: direction in which to apply blur, 0 or 90 size (int): kernel size; 80 found empirically to work well - + Returns: np.array @@ -50,8 +50,8 @@ def apply_motion_blur(image: np.array, angle, size=80): def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): """Identifies and reinforces horizontal and vertical lines in a binary image. - - Args: + + Args: img_bin (np.array): array corresponding to single binarized page image bounding_rects (list): list of layout boxes of the form (x, y, w, h), potentially containing tables @@ -140,11 +140,11 @@ def preprocess(image: np.array): def parse_table(image: np.array, show=False): - """Runs the full table parsing process. - + """Runs the full table parsing process. + Args: image (np.array): single PDF page, opened as PIL.Image object and converted to a numpy array - + Returns: list: list of rectangles corresponding to table cells """ @@ -154,10 +154,10 @@ def parse_table(image: np.array, show=False): return area > 2000 and w > 35 and h > 25 image = preprocess(image) - + table_layout_boxes = find_table_layout_boxes(image) image = isolate_vertical_and_horizontal_components(image, table_layout_boxes) - + _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S) stats = np.vstack(list(filter(is_large_enough, stats))) diff --git a/cv_analysis/utils/visual_logging.py b/cv_analysis/utils/visual_logging.py index 19bbded..6afbd57 100644 --- a/cv_analysis/utils/visual_logging.py +++ b/cv_analysis/utils/visual_logging.py @@ -2,21 +2,21 @@ import os from cv_analysis.config import CONFIG from cv_analysis.utils.display import save_mpl -LEVEL = CONFIG.visual_logging.level -OUTPUT_FOLDER = CONFIG.visual_logging.output_folder - class VisualLogger: - def __init__(self): - self.level_is_debug = LEVEL == "DEBUG" - self.output_folder = OUTPUT_FOLDER + def __init__(self, level, output_folder): + self.level = level + self.output_folder = output_folder if not os.path.exists(self.output_folder): os.mkdir(self.output_folder) def debug(self, img, name): - if self.level_is_debug: + if self.level_is_debug(): output_path = os.path.join(self.output_folder, name) save_mpl(img, output_path) + + def level_is_debug(self): + return self.level == "DEBUG" -vizlogger = VisualLogger() +vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder) diff --git a/scripts/annotate.py b/scripts/annotate.py index a5d8e20..c92ecf1 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -20,7 +20,7 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - #print(args.show) + # print(args.show) if args.type == "table": annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show) elif args.type == "redaction": diff --git a/scripts/client_mock.py b/scripts/client_mock.py index 96ab9b4..a64fe95 100644 --- a/scripts/client_mock.py +++ b/scripts/client_mock.py @@ -34,7 +34,7 @@ def parse_args(): def main(args): - + operations = args.operations.split(",") for operation in operations: print("****************************") diff --git a/src/run_service.py b/src/run_service.py index 304e8d8..876b96e 100644 --- a/src/run_service.py +++ b/src/run_service.py @@ -87,20 +87,12 @@ def main(): tracemalloc.stop() - def make_annotations(pdf, annotation_function): results = [] for i, page in enumerate(pdf): boxes = annotation_function(page) - cells= [] - if boxes: - cells = [{"x": x, "y": y, "width": w, "height": h} for x,y,w,h in boxes] - results.append({ - "page": i, - "pageWidth": page.shape[1], - "pageHeight": page.shape[0], - "cells": cells - }) + cells = [{"x": x, "y": y, "width": w, "height": h} for x, y, w, h in boxes] + results.append({"page": i, "pageWidth": page.shape[1], "pageHeight": page.shape[0], "cells": cells}) logger.info(str(results)) logger.info(type(results)) output_dict = {"pages": results} @@ -118,7 +110,7 @@ def annotate(annotation_function): logger.info(f"Processing data.") pdf, angles = open_pdf(data) annotations = make_annotations(pdf, annotation_function) - #if CONFIG.deskew.function != "identity": + # if CONFIG.deskew.function != "identity": # annotations.update({"deskew_angles": angles}) return annotations From 41e5f55ea700faca12631c1398875db0459073bb Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Wed, 27 Apr 2022 09:18:57 +0200 Subject: [PATCH 3/5] got changes to table parsing from other branch --- cv_analysis/table_parsing.py | 59 +++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 11e64d2..0a6ceed 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -11,12 +11,15 @@ from cv_analysis.utils.display import show_mpl from cv_analysis.utils.draw import draw_rectangles from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d from cv_analysis.utils.deskew import deskew_histbased +from cv_analysis.utils.filters import is_large_enough from cv_analysis.utils.visual_logging import vizlogger from cv_analysis.layout_parsing import parse_layout -def add_external_contours(image, img): - contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) +def add_external_contours(image, contour_source_image): + contours, _ = cv2.findContours(contour_source_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + contours = filter(partial(is_large_enough, min_area=5000), contours) + for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) @@ -24,6 +27,16 @@ def add_external_contours(image, img): return image +def extend_lines(): + #TODO + pass + + +def make_table_block_mask(): + #TODO + pass + + def apply_motion_blur(image: np.array, angle, size=80): """Solidifies and slightly extends detected lines. @@ -48,7 +61,7 @@ def apply_motion_blur(image: np.array, angle, size=80): return blurred -def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): +def isolate_vertical_and_horizontal_components(img_bin): """Identifies and reinforces horizontal and vertical lines in a binary image. Args: @@ -65,19 +78,20 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) vizlogger.debug(img_bin_h, "tables01_isolate01_img_bin_h.png") img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - vizlogger.debug(img_bin_v, "tables02_isolate02_img_bin_v.png") - + img_lines_raw = img_bin_v | img_bin_h + vizlogger.debug(img_lines_raw, "tables02_isolate02_img_bin_v.png") + kernel_h = np.ones((1, 30), np.uint8) kernel_v = np.ones((30, 1), np.uint8) img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2) vizlogger.debug(img_bin_h, "tables03_isolate03_dilate_h.png") img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2) - vizlogger.debug(img_bin_v, "tables04_isolate04_dilate_v.png") + vizlogger.debug(img_bin_v | img_bin_h, "tables04_isolate04_dilate_v.png") img_bin_h = apply_motion_blur(img_bin_h, 0) vizlogger.debug(img_bin_h, "tables09_isolate05_blur_h.png") img_bin_v = apply_motion_blur(img_bin_v, 90) - vizlogger.debug(img_bin_v, "tables10_isolate06_blur_v.png") + vizlogger.debug(img_bin_v | img_bin_h, "tables10_isolate06_blur_v.png") img_bin_final = img_bin_h | img_bin_v vizlogger.debug(img_bin_final, "tables11_isolate07_final.png") @@ -86,20 +100,14 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): vizlogger.debug(img_bin_final, "tables10_isolate12_threshold.png") img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1) vizlogger.debug(img_bin_final, "tables11_isolate13_dilate.png") - - img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects) - vizlogger.debug(img_bin_final, "tables12_isolate14_disconnect.png") + + # add contours before lines are extended by blurring + img_bin_final = add_external_contours(img_bin_final, img_lines_raw) + vizlogger.debug(img_bin_final, "tables11_isolate14_contours_added.png") return img_bin_final -def disconnect_non_existing_cells(img_bin, bounding_rects): - for rect in bounding_rects: - x, y, w, h = rect - img_bin = cv2.rectangle(img_bin, (x, y), (x + w, y + h), (0, 0, 0), 5) - return img_bin - - def has_table_shape(rects): assert isinstance(rects, list) @@ -156,7 +164,10 @@ def parse_table(image: np.array, show=False): image = preprocess(image) table_layout_boxes = find_table_layout_boxes(image) - image = isolate_vertical_and_horizontal_components(image, table_layout_boxes) + + image = isolate_vertical_and_horizontal_components(image) + #image = add_external_contours(image, image) + #vizlogger.debug(image, "external_contours_added.png") _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S) @@ -177,7 +188,13 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False): stats = parse_table(page) page = draw_rectangles(page, stats, annotate=True) - - if show: - show_mpl(page) vizlogger.debug(page, "tables15_final_output.png") + + +def tables_in_image(cropped_image): + table_rects = parse_table(cropped_image) + + if len(table_rects) > 0: + return True, table_rects + else: + return False, None From 81fe5139c2d823e344535293a16dd45f26d314ba Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Wed, 27 Apr 2022 10:52:35 +0200 Subject: [PATCH 4/5] fixed tests, passed (still need to extend tests) --- cv_analysis/table_parsing.py | 2 +- cv_analysis/test/test_data/table.json | 460 ++++---------------------- src/run_service.py | 7 - 3 files changed, 59 insertions(+), 410 deletions(-) diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 0a6ceed..852df2b 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -174,7 +174,7 @@ def parse_table(image: np.array, show=False): stats = np.vstack(list(filter(is_large_enough, stats))) rects = stats[:, :-1][2:] - return list(rects) + return list(map(list, rects)) def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False): diff --git a/cv_analysis/test/test_data/table.json b/cv_analysis/test/test_data/table.json index 009b24e..5e78d0e 100644 --- a/cv_analysis/test/test_data/table.json +++ b/cv_analysis/test/test_data/table.json @@ -1,406 +1,62 @@ { "0": [ - [ - 211, - 415, - 367, - 29 - ], - [ - 581, - 415, - 417, - 29 - ], - [ - 1001, - 415, - 406, - 29 - ], - [ - 211, - 447, - 367, - 47 - ], - [ - 581, - 447, - 417, - 47 - ], - [ - 1001, - 447, - 406, - 47 - ], - [ - 211, - 497, - 367, - 47 - ], - [ - 580, - 497, - 418, - 47 - ], - [ - 1001, - 497, - 406, - 47 - ], - [ - 211, - 547, - 367, - 47 - ], - [ - 580, - 547, - 418, - 47 - ], - [ - 1001, - 547, - 406, - 47 - ], - [ - 211, - 597, - 367, - 47 - ], - [ - 581, - 597, - 417, - 47 - ], - [ - 1001, - 597, - 406, - 48 - ], - [ - 212, - 647, - 366, - 48 - ], - [ - 581, - 647, - 417, - 48 - ], - [ - 1001, - 647, - 406, - 48 - ], - [ - 581, - 697, - 417, - 47 - ], - [ - 1001, - 697, - 407, - 48 - ], - [ - 212, - 698, - 366, - 47 - ], - [ - 211, - 747, - 367, - 48 - ], - [ - 581, - 747, - 417, - 48 - ], - [ - 1001, - 748, - 407, - 47 - ], - [ - 211, - 798, - 367, - 47 - ], - [ - 581, - 798, - 417, - 47 - ], - [ - 1001, - 798, - 407, - 47 - ], - [ - 212, - 848, - 366, - 47 - ], - [ - 581, - 848, - 417, - 47 - ], - [ - 1001, - 848, - 407, - 48 - ], - [ - 212, - 898, - 366, - 48 - ], - [ - 581, - 898, - 417, - 48 - ], - [ - 1001, - 898, - 407, - 48 - ], - [ - 212, - 949, - 366, - 33 - ], - [ - 581, - 949, - 827, - 33 - ], - [ - 462, - 1163, - 368, - 29 - ], - [ - 833, - 1163, - 404, - 29 - ], - [ - 462, - 1195, - 368, - 48 - ], - [ - 833, - 1195, - 404, - 48 - ], - [ - 462, - 1245, - 368, - 48 - ], - [ - 833, - 1245, - 404, - 47 - ], - [ - 462, - 1296, - 368, - 47 - ], - [ - 833, - 1296, - 404, - 47 - ], - [ - 462, - 1346, - 368, - 47 - ], - [ - 833, - 1346, - 404, - 47 - ], - [ - 462, - 1396, - 368, - 47 - ], - [ - 834, - 1396, - 403, - 47 - ], - [ - 462, - 1446, - 368, - 48 - ], - [ - 833, - 1446, - 404, - 48 - ], - [ - 462, - 1496, - 368, - 48 - ], - [ - 833, - 1496, - 404, - 48 - ], - [ - 462, - 1547, - 368, - 47 - ], - [ - 834, - 1547, - 403, - 47 - ], - [ - 462, - 1597, - 368, - 48 - ], - [ - 834, - 1597, - 403, - 47 - ], - [ - 462, - 1647, - 368, - 48 - ], - [ - 833, - 1647, - 404, - 48 - ], - [ - 462, - 1698, - 368, - 47 - ], - [ - 833, - 1698, - 404, - 47 - ], - [ - 462, - 1748, - 368, - 47 - ], - [ - 834, - 1748, - 403, - 47 - ], - [ - 462, - 1798, - 368, - 47 - ], - [ - 834, - 1798, - 403, - 47 - ], - [ - 462, - 1848, - 368, - 48 - ], - [ - 834, - 1848, - 403, - 48 - ], - [ - 462, - 1899, - 369, - 33 - ], - [ - 832, - 1899, - 405, - 33 - ] + [211, 447, 367, 47], + [581, 447, 417, 47], + [1001, 447, 406, 47], + [211, 497, 367, 47], + [580, 497, 418, 47], + [1001, 497, 406, 47], + [211, 547, 367, 47], + [580, 547, 418, 47], + [1001, 547, 406, 47], + [211, 597, 367, 47], + [581, 597, 417, 47], + [1001, 597, 406, 48], + [212, 647, 366, 48], + [581, 647, 417, 48], + [1001, 647, 406, 48], + [581, 697, 417, 47], + [1001, 697, 407, 48], + [212, 698, 366, 47], + [211, 747, 367, 48], + [581, 747, 417, 48], + [1001, 748, 407, 47], + [211, 798, 367, 47], + [581, 798, 417, 47], + [1001, 798, 407, 47], + [212, 848, 366, 47], + [581, 848, 417, 47], + [1001, 848, 407, 48], + [212, 898, 366, 48], + [581, 898, 417, 48], + [1001, 898, 407, 48], + [462, 1195, 368, 48], + [833, 1195, 404, 48], + [462, 1245, 368, 48], + [833, 1245, 404, 47], + [462, 1296, 368, 47], + [833, 1296, 404, 47], + [462, 1346, 368, 47], + [833, 1346, 404, 47], + [462, 1396, 368, 47], + [834, 1396, 403, 47], + [462, 1446, 368, 48], + [833, 1446, 404, 48], + [462, 1496, 368, 48], + [833, 1496, 404, 48], + [462, 1547, 368, 47], + [834, 1547, 403, 47], + [462, 1597, 368, 48], + [834, 1597, 403, 47], + [462, 1647, 368, 48], + [833, 1647, 404, 48], + [462, 1698, 368, 47], + [833, 1698, 404, 47], + [462, 1748, 368, 47], + [834, 1748, 403, 47], + [462, 1798, 368, 47], + [834, 1798, 403, 47], + [462, 1848, 368, 48], + [834, 1848, 403, 48] ] } \ No newline at end of file diff --git a/src/run_service.py b/src/run_service.py index 876b96e..86454eb 100644 --- a/src/run_service.py +++ b/src/run_service.py @@ -25,7 +25,6 @@ def suppress_user_warnings(): def main(): file_counter = Counter("cv_analysis_file_counter", "count processed files") - # page_counter = Counter("cv_analysis_page_counter", "count pages from processed files") ram_metric = Gauge("cv_analysis_memory_usage", "Memory usage in Mb") def start_monitoring(): @@ -44,7 +43,6 @@ def main(): def get_tables(): start_monitoring() tables = annotate(parse_table) - # page_counter.inc(npages) return tables @app.route("/redactions", methods=["POST"]) @@ -52,7 +50,6 @@ def main(): def get_redactions(): start_monitoring() redactions = annotate(find_redactions) - # page_counter.inc(npages) return redactions @app.route("/figures", methods=["POST"]) @@ -60,7 +57,6 @@ def main(): def get_figures(): start_monitoring() figures = annotate(detect_figures) - # page_counter.inc(npages) return figures @app.route("/layout", methods=["POST"]) @@ -68,7 +64,6 @@ def main(): def get_layout(): start_monitoring() layout = annotate(parse_layout) - # page_counter.inc(npages) return layout @app.route("/status", methods=["GET"]) @@ -93,8 +88,6 @@ def make_annotations(pdf, annotation_function): boxes = annotation_function(page) cells = [{"x": x, "y": y, "width": w, "height": h} for x, y, w, h in boxes] results.append({"page": i, "pageWidth": page.shape[1], "pageHeight": page.shape[0], "cells": cells}) - logger.info(str(results)) - logger.info(type(results)) output_dict = {"pages": results} return jsonify(json.dumps(output_dict, default=npconvert)) From 21d1f087c84d4eb61e8ac77a691cdc19fd67617c Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Wed, 27 Apr 2022 11:27:38 +0200 Subject: [PATCH 5/5] fixed show parameter, for development only --- cv_analysis/table_parsing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 852df2b..5d4b522 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -189,6 +189,8 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False): stats = parse_table(page) page = draw_rectangles(page, stats, annotate=True) vizlogger.debug(page, "tables15_final_output.png") + if show: + show_mpl(page) def tables_in_image(cropped_image):