From 3669b6b341361738437aecc46835ee73ac9b53f1 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 20 Apr 2022 09:43:30 +0200 Subject: [PATCH] fig_detection_with_layout.py: approach to label the content of a page through layout detection, table parsing for detected tables needs to be added and overall codes needs to be reviewed layout_parsing.py added condition so fig_detection_with_layout.py works table_parsing.py uncommented line for better table parsing text.py changed kernel sizes --- cv_analysis/fig_detection_with_layout.py | 59 ++++++++++++++++++++++++ cv_analysis/figure_detection.py | 15 ++++-- cv_analysis/layout_parsing.py | 9 ++-- cv_analysis/table_parsing.py | 11 ++++- cv_analysis/utils/text.py | 16 ++++--- 5 files changed, 96 insertions(+), 14 deletions(-) create mode 100644 cv_analysis/fig_detection_with_layout.py diff --git a/cv_analysis/fig_detection_with_layout.py b/cv_analysis/fig_detection_with_layout.py new file mode 100644 index 0000000..bd84789 --- /dev/null +++ b/cv_analysis/fig_detection_with_layout.py @@ -0,0 +1,59 @@ +from cv_analysis.layout_parsing import annotate_layout_in_pdf +from cv_analysis.figure_detection import figures_in_image, detect_figures +from cv_analysis.table_parsing import tables_in_image +from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions +from cv_analysis.utils.draw import draw_rectangles +from cv_analysis.utils.display import show_mpl + + +def detect_parting_line(image): + pass + + +def cut_out_content_structures(layout_rects, page): + large_enough_rects = [] + too_small_rects = [] + for x, y, w, h in layout_rects: + rect = (x, y, w, h) + if w * h >= 100000: + cropped_page = page[y:(y + h), x:(x + w)] + large_enough_rects.append([rect, cropped_page]) + else: + cropped_page = page[y:(y + h), x:(x + w)] + too_small_rects.append([rect, cropped_page]) + return large_enough_rects, too_small_rects + + +def parse_and_label_content_structures(page, large_enough_rects, too_small_rects): + for coordinates, cropped_image in large_enough_rects: + non_text_rects = detect_figures(cropped_image) + print(len(non_text_rects), len(list(non_text_rects))) + if len(non_text_rects) == 0: + page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True) + elif tables_in_image(cropped_image)[0]: + page = draw_rectangles(page, [coordinates], color=(255, 0, 0), annotate=True) + else: + page = draw_rectangles(page, [coordinates], color=(0, 0, 255), annotate=True) + + # for coordinates, cropped_image in too_small_rects: + # non_text_rects = detect_figures(cropped_image) + # if len(non_text_rects) == 0 and len(list(find_primary_text_regions(cropped_image))) > 0: + # page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True) + # else: + # page = draw_rectangles(page, [coordinates], color=(0, 255, 255), annotate=True) + return page + + +def detect_figures_over_layout(): + # pdf_path = "/home/lillian/PycharmProjects/ner_address/data/pdfs/syngenta/026c917f04660aaea4bb57d180f9598b.pdf" + # pdf_path = "/home/lillian/ocr_docs/ocr1.pdf" + pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf" + #pdf_path = "/home/lillian/ocr_docs/VV-857853.pdf" + page_index = 13 + layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True) + big_structures, small_structures = cut_out_content_structures(layout_rects, page) + page = parse_and_label_content_structures(page, big_structures, small_structures) + show_mpl(page) + + +detect_figures_over_layout() diff --git a/cv_analysis/figure_detection.py b/cv_analysis/figure_detection.py index 3827536..4c44a57 100644 --- a/cv_analysis/figure_detection.py +++ b/cv_analysis/figure_detection.py @@ -15,11 +15,10 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): def detect_figures(image: np.array): - image = image.copy() - + #show_mpl(image) image = remove_primary_text_regions(image) - show_mpl(image) + #show_mpl(image) cnts = detect_large_coherent_structures(image) cnts = filter(is_likely_figure, cnts) @@ -30,7 +29,6 @@ def detect_figures(image: np.array): def detect_figures_in_pdf(pdf_path, page_index=1, show=False): - page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) @@ -41,3 +39,12 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False): show_mpl(page) else: return page + + +def figures_in_image(cropped_page): + redaction_contours = detect_figures(cropped_page) + + if len(redaction_contours) > 0: + return True + else: + return False diff --git a/cv_analysis/layout_parsing.py b/cv_analysis/layout_parsing.py index ae5559f..19d56a1 100644 --- a/cv_analysis/layout_parsing.py +++ b/cv_analysis/layout_parsing.py @@ -63,15 +63,18 @@ def parse_layout(image: np.array): return list(rects) -def annotate_layout_in_pdf(pdf_path, page_index=1, show=False): +def annotate_layout_in_pdf(pdf_path, page_index=1, return_rects=False, show=False): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) rects = parse_layout(page) - page = draw_rectangles(page, rects) - if show: + if return_rects: + return rects, page + elif show: + page = draw_rectangles(page, rects) show_mpl(page) else: + page = draw_rectangles(page, rects) return page diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 404b7ed..83dd2e5 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -10,13 +10,14 @@ from cv_analysis.utils.display import show_mpl from cv_analysis.utils.draw import draw_rectangles from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated from cv_analysis.utils.deskew import deskew_histbased +from cv_analysis.utils.filters import is_large_enough from cv_analysis.layout_parsing import parse_layout def add_external_contours(image, img): contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - # contours = filter(partial(is_large_enough, min_area=5000000), contours) + contours = filter(partial(is_large_enough, min_area=5000000), contours) for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) @@ -154,3 +155,11 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False): show_mpl(page) else: return page + +def tables_in_image(cropped_image): + table_rects = parse_table(cropped_image) + + if len(table_rects)>0: + return True, table_rects + else: + return False, None diff --git a/cv_analysis/utils/text.py b/cv_analysis/utils/text.py index 7ce6d7f..31f3d2c 100644 --- a/cv_analysis/utils/text.py +++ b/cv_analysis/utils/text.py @@ -1,5 +1,5 @@ import cv2 - +from cv_analysis.utils.display import show_mpl def remove_primary_text_regions(image): """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. @@ -17,6 +17,7 @@ def remove_primary_text_regions(image): for cnt in cnts: x, y, w, h = cv2.boundingRect(cnt) + print(x,y,w,h, w*h, w/h) cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) return image @@ -36,7 +37,9 @@ def find_primary_text_regions(image): """ def is_likely_primary_text_segments(cnt): - return 700 < cv2.contourArea(cnt) < 16000 + x,y,w,h = cv2.boundingRect(cnt) + print(cv2.contourArea(cnt)) + return 800 < cv2.contourArea(cnt) < 16000 or w/h > 3 image = image.copy() @@ -45,13 +48,14 @@ def find_primary_text_regions(image): image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1) - # show_mpl(close) - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) + show_mpl(close) + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 5)) dilate = cv2.dilate(close, dilate_kernel, iterations=1) - # show_mpl(dilate) + show_mpl(dilate) cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) cnts = filter(is_likely_primary_text_segments, cnts) return cnts +