diff --git a/cv_analysis/pyinfra_compat b/cv_analysis/pyinfra_compat new file mode 100644 index 0000000..4c346f2 --- /dev/null +++ b/cv_analysis/pyinfra_compat @@ -0,0 +1,32 @@ +from cv_analysis.table_parsing import parse_tables +from cv_analysis.redaction_detection import find_redactions +from cv_analysis.layout_parsing import parse_layout +from cv_analysis.figure_detection import detect_figures +from cv_analysis.utils.preprocessing import open_img_from_bytes + + +task_dict = { + "table": parse_tables, + "figure": detect_figures, + "layout": parse_layout, + "redaction": find_redactions, +} + + +def analyze_byteslist(img_bytes_list, task="table"): + + analysis_function = task_dict[task] + + result = [] + for i, img_bytes in enumerate(img_bytes_list): + page = open_img_from_bytes(img_bytes) + cells = list(map(lambda x: x.json_xywh(), analysis_function(page))) + page_dict = { + "page": i, + "pageWidth": page.shape[1], + "pageHeight": page.shape[0], + "cells": cells + } + result.append(page_dict) + + return result \ No newline at end of file diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 43716da..78ede82 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -172,21 +172,4 @@ def parse_tables(image: np.array, show=False): stats = np.vstack(list(filter(is_large_enough, stats))) rects = stats[:, :-1][2:] - # print(rects) return list(map(Rectangle.from_xywh, rects)) - - -# def annotate_tables_in_pdf(page, page_index=0, deskew=False, show=False): -# """ """ -# #page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] -# #page = np.array(page) -# if show: -# show_mpl(page) -# if deskew: -# page, _ = deskew_histbased(page) - -# stats = parse_tables(page) -# page = draw_rectangles(page, stats, annotate=True) -# vizlogger.debug(page, "tables15_final_output.png") -# if show: -# show_mpl(page) diff --git a/cv_analysis/test/test_data/table.jpg b/cv_analysis/test/test_data/table.jpg deleted file mode 100644 index 2b47101..0000000 Binary files a/cv_analysis/test/test_data/table.jpg and /dev/null differ diff --git a/cv_analysis/test/test_data/table.json b/cv_analysis/test/test_data/table.json deleted file mode 100644 index 44e82ea..0000000 --- a/cv_analysis/test/test_data/table.json +++ /dev/null @@ -1,359 +0,0 @@ -{ - "pages": [ - { - "page": 0, - "pageWidth": 2346, - "pageHeight": 1663, - "cells": [ - { - "x": 211, - "y": 447, - "width": 367, - "height": 47 - }, - { - "x": 581, - "y": 447, - "width": 417, - "height": 47 - }, - { - "x": 1001, - "y": 447, - "width": 406, - "height": 47 - }, - { - "x": 211, - "y": 497, - "width": 367, - "height": 47 - }, - { - "x": 580, - "y": 497, - "width": 418, - "height": 47 - }, - { - "x": 1001, - "y": 497, - "width": 406, - "height": 47 - }, - { - "x": 211, - "y": 547, - "width": 367, - "height": 47 - }, - { - "x": 580, - "y": 547, - "width": 418, - "height": 47 - }, - { - "x": 1001, - "y": 547, - "width": 406, - "height": 47 - }, - { - "x": 211, - "y": 597, - "width": 367, - "height": 47 - }, - { - "x": 581, - "y": 597, - "width": 417, - "height": 47 - }, - { - "x": 1001, - "y": 597, - "width": 406, - "height": 48 - }, - { - "x": 212, - "y": 647, - "width": 366, - "height": 48 - }, - { - "x": 581, - "y": 647, - "width": 417, - "height": 48 - }, - { - "x": 1001, - "y": 647, - "width": 406, - "height": 48 - }, - { - "x": 581, - "y": 697, - "width": 417, - "height": 47 - }, - { - "x": 1001, - "y": 697, - "width": 407, - "height": 48 - }, - { - "x": 212, - "y": 698, - "width": 366, - "height": 47 - }, - { - "x": 211, - "y": 747, - "width": 367, - "height": 48 - }, - { - "x": 581, - "y": 747, - "width": 417, - "height": 48 - }, - { - "x": 1001, - "y": 748, - "width": 407, - "height": 47 - }, - { - "x": 211, - "y": 798, - "width": 367, - "height": 47 - }, - { - "x": 581, - "y": 798, - "width": 417, - "height": 47 - }, - { - "x": 1001, - "y": 798, - "width": 407, - "height": 47 - }, - { - "x": 212, - "y": 848, - "width": 366, - "height": 47 - }, - { - "x": 581, - "y": 848, - "width": 417, - "height": 47 - }, - { - "x": 1001, - "y": 848, - "width": 407, - "height": 48 - }, - { - "x": 212, - "y": 898, - "width": 366, - "height": 48 - }, - { - "x": 581, - "y": 898, - "width": 417, - "height": 48 - }, - { - "x": 1001, - "y": 898, - "width": 407, - "height": 48 - }, - { - "x": 462, - "y": 1195, - "width": 368, - "height": 48 - }, - { - "x": 833, - "y": 1195, - "width": 404, - "height": 48 - }, - { - "x": 462, - "y": 1245, - "width": 368, - "height": 48 - }, - { - "x": 833, - "y": 1245, - "width": 404, - "height": 47 - }, - { - "x": 462, - "y": 1296, - "width": 368, - "height": 47 - }, - { - "x": 833, - "y": 1296, - "width": 404, - "height": 47 - }, - { - "x": 462, - "y": 1346, - "width": 368, - "height": 47 - }, - { - "x": 833, - "y": 1346, - "width": 404, - "height": 47 - }, - { - "x": 462, - "y": 1396, - "width": 368, - "height": 47 - }, - { - "x": 834, - "y": 1396, - "width": 403, - "height": 47 - }, - { - "x": 462, - "y": 1446, - "width": 368, - "height": 48 - }, - { - "x": 833, - "y": 1446, - "width": 404, - "height": 48 - }, - { - "x": 462, - "y": 1496, - "width": 368, - "height": 48 - }, - { - "x": 833, - "y": 1496, - "width": 404, - "height": 48 - }, - { - "x": 462, - "y": 1547, - "width": 368, - "height": 47 - }, - { - "x": 834, - "y": 1547, - "width": 403, - "height": 47 - }, - { - "x": 462, - "y": 1597, - "width": 368, - "height": 48 - }, - { - "x": 834, - "y": 1597, - "width": 403, - "height": 47 - }, - { - "x": 462, - "y": 1647, - "width": 368, - "height": 48 - }, - { - "x": 833, - "y": 1647, - "width": 404, - "height": 48 - }, - { - "x": 462, - "y": 1698, - "width": 368, - "height": 47 - }, - { - "x": 833, - "y": 1698, - "width": 404, - "height": 47 - }, - { - "x": 462, - "y": 1748, - "width": 368, - "height": 47 - }, - { - "x": 834, - "y": 1748, - "width": 403, - "height": 47 - }, - { - "x": 462, - "y": 1798, - "width": 368, - "height": 47 - }, - { - "x": 834, - "y": 1798, - "width": 403, - "height": 47 - }, - { - "x": 462, - "y": 1848, - "width": 368, - "height": 48 - }, - { - "x": 834, - "y": 1848, - "width": 403, - "height": 48 - } - ] - } - ] -} \ No newline at end of file diff --git a/cv_analysis/utils/preprocessing.py b/cv_analysis/utils/preprocessing.py index ed282ba..011d54e 100644 --- a/cv_analysis/utils/preprocessing.py +++ b/cv_analysis/utils/preprocessing.py @@ -1,3 +1,4 @@ +from io import BytesIO from numpy import array, ndarray import pdf2image from PIL import Image @@ -29,3 +30,8 @@ def open_pdf(pdf, first_page=0, last_page=None): pages = [preprocess_pdf_image(array(p)) for p in pages] return pages + + +def open_img_from_bytes(bytes_obj: bytes): + page = Image.open(BytesIO(bytes_obj)) + return preprocess_pdf_image(array(page)) diff --git a/cv_analysis/utils/structures.py b/cv_analysis/utils/structures.py index 6702080..f6c3c6e 100644 --- a/cv_analysis/utils/structures.py +++ b/cv_analysis/utils/structures.py @@ -4,12 +4,12 @@ from json import dumps class Rectangle: def __init__(self, x1=None, y1=None, w=None, h=None, x2=None, y2=None, indent=4, format="xywh"): try: - self.x1 = x1 - self.y1 = y1 - self.w = w if w else x2 - x1 - self.h = h if h else y2 - y1 - self.x2 = x2 if x2 else x1 + w - self.y2 = y2 if y2 else y1 + h + self.x1 = int(x1) + self.y1 = int(y1) + self.w = int(w) if w else int(x2 - x1) + self.h = int(h) if h else int(y2 - y1) + self.x2 = int(x2) if x2 else self.x1 + self.w + self.y2 = int(y2) if y2 else self.y1 + self.h assert (self.x1 + self.w) == self.x2 assert (self.y1 + self.h) == self.y2 self.indent = indent @@ -56,14 +56,6 @@ class Rectangle: return list(self.json().values()).__iter__() -""" -boxes = [[30,40,5,6],[56,78,23,19],[5,100,45,35],[34,34,67,67]] -rectangles = list(map(Rectangle.from_xywh, boxes)) -rectangles -r = rectangles[1] -""" - - class Contour: def __init__(self): pass diff --git a/src/run_service.py b/src/run_service.py index f063ed1..a86664e 100644 --- a/src/run_service.py +++ b/src/run_service.py @@ -13,7 +13,6 @@ from cv_analysis.redaction_detection import find_redactions from cv_analysis.layout_parsing import parse_layout from cv_analysis.figure_detection import detect_figures from cv_analysis.utils.logging import logger -from cv_analysis.utils.post_processing import Rectangle from cv_analysis.utils.preprocessing import open_pdf from cv_analysis.utils.structures import Rectangle from cv_analysis.config import CONFIG