Added the pyinfra_compat module. Usage: `from cv_analysis.pyinfra_compat import analyze_byteslist; page_results = analyze_byteslist(img_bytes_list)`

This commit is contained in:
Isaac Riley 2022-06-14 09:09:00 +02:00
parent 0d9d577187
commit b66a7f15e1
7 changed files with 44 additions and 391 deletions

View File

@ -0,0 +1,32 @@
from cv_analysis.table_parsing import parse_tables
from cv_analysis.redaction_detection import find_redactions
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.figure_detection import detect_figures
from cv_analysis.utils.preprocessing import open_img_from_bytes
# Dispatch table: maps each supported task name to the function that is
# called on a page image for that task.
task_dict = dict(
    table=parse_tables,
    figure=detect_figures,
    layout=parse_layout,
    redaction=find_redactions,
)
def analyze_byteslist(img_bytes_list, task="table"):
    """Run one analysis task on every encoded page image in a list.

    Args:
        img_bytes_list: Iterable of encoded images, one per page; each item
            is passed to ``open_img_from_bytes``.
        task: Key into ``task_dict`` selecting the analysis function.

    Returns:
        A list with one dict per input item, in input order. Each dict holds
        ``"page"`` (zero-based position in the input), ``"pageWidth"`` and
        ``"pageHeight"`` (taken from ``shape[1]`` and ``shape[0]`` of the
        opened page) and ``"cells"`` (the ``json_xywh()`` output of every
        object the analysis function returned for that page).

    Raises:
        KeyError: If ``task`` is not a key of ``task_dict``.
    """
    # Resolve the task up front so an unknown task fails even for empty input.
    analyze = task_dict[task]

    def describe_page(index, raw_bytes):
        # Build the result record for a single page.
        image = open_img_from_bytes(raw_bytes)
        boxes = [found.json_xywh() for found in analyze(image)]
        return {
            "page": index,
            "pageWidth": image.shape[1],
            "pageHeight": image.shape[0],
            "cells": boxes,
        }

    return [
        describe_page(index, raw_bytes)
        for index, raw_bytes in enumerate(img_bytes_list)
    ]

View File

@ -172,21 +172,4 @@ def parse_tables(image: np.array, show=False):
stats = np.vstack(list(filter(is_large_enough, stats)))
rects = stats[:, :-1][2:]
# print(rects)
return list(map(Rectangle.from_xywh, rects))
# def annotate_tables_in_pdf(page, page_index=0, deskew=False, show=False):
# """ """
# #page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
# #page = np.array(page)
# if show:
# show_mpl(page)
# if deskew:
# page, _ = deskew_histbased(page)
# stats = parse_tables(page)
# page = draw_rectangles(page, stats, annotate=True)
# vizlogger.debug(page, "tables15_final_output.png")
# if show:
# show_mpl(page)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 215 KiB

View File

@ -1,359 +0,0 @@
{
"pages": [
{
"page": 0,
"pageWidth": 2346,
"pageHeight": 1663,
"cells": [
{
"x": 211,
"y": 447,
"width": 367,
"height": 47
},
{
"x": 581,
"y": 447,
"width": 417,
"height": 47
},
{
"x": 1001,
"y": 447,
"width": 406,
"height": 47
},
{
"x": 211,
"y": 497,
"width": 367,
"height": 47
},
{
"x": 580,
"y": 497,
"width": 418,
"height": 47
},
{
"x": 1001,
"y": 497,
"width": 406,
"height": 47
},
{
"x": 211,
"y": 547,
"width": 367,
"height": 47
},
{
"x": 580,
"y": 547,
"width": 418,
"height": 47
},
{
"x": 1001,
"y": 547,
"width": 406,
"height": 47
},
{
"x": 211,
"y": 597,
"width": 367,
"height": 47
},
{
"x": 581,
"y": 597,
"width": 417,
"height": 47
},
{
"x": 1001,
"y": 597,
"width": 406,
"height": 48
},
{
"x": 212,
"y": 647,
"width": 366,
"height": 48
},
{
"x": 581,
"y": 647,
"width": 417,
"height": 48
},
{
"x": 1001,
"y": 647,
"width": 406,
"height": 48
},
{
"x": 581,
"y": 697,
"width": 417,
"height": 47
},
{
"x": 1001,
"y": 697,
"width": 407,
"height": 48
},
{
"x": 212,
"y": 698,
"width": 366,
"height": 47
},
{
"x": 211,
"y": 747,
"width": 367,
"height": 48
},
{
"x": 581,
"y": 747,
"width": 417,
"height": 48
},
{
"x": 1001,
"y": 748,
"width": 407,
"height": 47
},
{
"x": 211,
"y": 798,
"width": 367,
"height": 47
},
{
"x": 581,
"y": 798,
"width": 417,
"height": 47
},
{
"x": 1001,
"y": 798,
"width": 407,
"height": 47
},
{
"x": 212,
"y": 848,
"width": 366,
"height": 47
},
{
"x": 581,
"y": 848,
"width": 417,
"height": 47
},
{
"x": 1001,
"y": 848,
"width": 407,
"height": 48
},
{
"x": 212,
"y": 898,
"width": 366,
"height": 48
},
{
"x": 581,
"y": 898,
"width": 417,
"height": 48
},
{
"x": 1001,
"y": 898,
"width": 407,
"height": 48
},
{
"x": 462,
"y": 1195,
"width": 368,
"height": 48
},
{
"x": 833,
"y": 1195,
"width": 404,
"height": 48
},
{
"x": 462,
"y": 1245,
"width": 368,
"height": 48
},
{
"x": 833,
"y": 1245,
"width": 404,
"height": 47
},
{
"x": 462,
"y": 1296,
"width": 368,
"height": 47
},
{
"x": 833,
"y": 1296,
"width": 404,
"height": 47
},
{
"x": 462,
"y": 1346,
"width": 368,
"height": 47
},
{
"x": 833,
"y": 1346,
"width": 404,
"height": 47
},
{
"x": 462,
"y": 1396,
"width": 368,
"height": 47
},
{
"x": 834,
"y": 1396,
"width": 403,
"height": 47
},
{
"x": 462,
"y": 1446,
"width": 368,
"height": 48
},
{
"x": 833,
"y": 1446,
"width": 404,
"height": 48
},
{
"x": 462,
"y": 1496,
"width": 368,
"height": 48
},
{
"x": 833,
"y": 1496,
"width": 404,
"height": 48
},
{
"x": 462,
"y": 1547,
"width": 368,
"height": 47
},
{
"x": 834,
"y": 1547,
"width": 403,
"height": 47
},
{
"x": 462,
"y": 1597,
"width": 368,
"height": 48
},
{
"x": 834,
"y": 1597,
"width": 403,
"height": 47
},
{
"x": 462,
"y": 1647,
"width": 368,
"height": 48
},
{
"x": 833,
"y": 1647,
"width": 404,
"height": 48
},
{
"x": 462,
"y": 1698,
"width": 368,
"height": 47
},
{
"x": 833,
"y": 1698,
"width": 404,
"height": 47
},
{
"x": 462,
"y": 1748,
"width": 368,
"height": 47
},
{
"x": 834,
"y": 1748,
"width": 403,
"height": 47
},
{
"x": 462,
"y": 1798,
"width": 368,
"height": 47
},
{
"x": 834,
"y": 1798,
"width": 403,
"height": 47
},
{
"x": 462,
"y": 1848,
"width": 368,
"height": 48
},
{
"x": 834,
"y": 1848,
"width": 403,
"height": 48
}
]
}
]
}

View File

@ -1,3 +1,4 @@
from io import BytesIO
from numpy import array, ndarray
import pdf2image
from PIL import Image
@ -29,3 +30,8 @@ def open_pdf(pdf, first_page=0, last_page=None):
pages = [preprocess_pdf_image(array(p)) for p in pages]
return pages
def open_img_from_bytes(bytes_obj: bytes):
    """Open an in-memory encoded image and preprocess it like a PDF page.

    The bytes are wrapped in an in-memory stream, opened with PIL, converted
    to a NumPy array and handed to ``preprocess_pdf_image``.

    Args:
        bytes_obj: Raw bytes of an encoded image.

    Returns:
        Whatever ``preprocess_pdf_image`` returns for the decoded array.
    """
    stream = BytesIO(bytes_obj)
    pixels = array(Image.open(stream))
    return preprocess_pdf_image(pixels)

View File

@ -4,12 +4,12 @@ from json import dumps
class Rectangle:
def __init__(self, x1=None, y1=None, w=None, h=None, x2=None, y2=None, indent=4, format="xywh"):
try:
self.x1 = x1
self.y1 = y1
self.w = w if w else x2 - x1
self.h = h if h else y2 - y1
self.x2 = x2 if x2 else x1 + w
self.y2 = y2 if y2 else y1 + h
self.x1 = int(x1)
self.y1 = int(y1)
self.w = int(w) if w else int(x2 - x1)
self.h = int(h) if h else int(y2 - y1)
self.x2 = int(x2) if x2 else self.x1 + self.w
self.y2 = int(y2) if y2 else self.y1 + self.h
assert (self.x1 + self.w) == self.x2
assert (self.y1 + self.h) == self.y2
self.indent = indent
@ -56,14 +56,6 @@ class Rectangle:
return list(self.json().values()).__iter__()
"""
boxes = [[30,40,5,6],[56,78,23,19],[5,100,45,35],[34,34,67,67]]
rectangles = list(map(Rectangle.from_xywh, boxes))
rectangles
r = rectangles[1]
"""
class Contour:
    """Contour type; it currently defines no attributes or behaviour."""

    def __init__(self):
        """Create an instance without setting any state."""

View File

@ -13,7 +13,6 @@ from cv_analysis.redaction_detection import find_redactions
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.figure_detection import detect_figures
from cv_analysis.utils.logging import logger
from cv_analysis.utils.post_processing import Rectangle
from cv_analysis.utils.preprocessing import open_pdf
from cv_analysis.utils.structures import Rectangle
from cv_analysis.config import CONFIG