diff --git a/cv_analysis/fig_detection_with_layout.py b/cv_analysis/fig_detection_with_layout.py index bd84789..921c256 100644 --- a/cv_analysis/fig_detection_with_layout.py +++ b/cv_analysis/fig_detection_with_layout.py @@ -1,41 +1,41 @@ from cv_analysis.layout_parsing import annotate_layout_in_pdf -from cv_analysis.figure_detection import figures_in_image, detect_figures -from cv_analysis.table_parsing import tables_in_image +from cv_analysis.figure_detection import detect_figures +from cv_analysis.table_parsing import tables_in_image, parse_table from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions from cv_analysis.utils.draw import draw_rectangles from cv_analysis.utils.display import show_mpl -def detect_parting_line(image): - pass - - def cut_out_content_structures(layout_rects, page): - large_enough_rects = [] - too_small_rects = [] + large_rects = [] + small_rects = [] for x, y, w, h in layout_rects: rect = (x, y, w, h) - if w * h >= 100000: + if w * h >= 50000: cropped_page = page[y:(y + h), x:(x + w)] - large_enough_rects.append([rect, cropped_page]) + large_rects.append([rect, cropped_page]) else: cropped_page = page[y:(y + h), x:(x + w)] - too_small_rects.append([rect, cropped_page]) - return large_enough_rects, too_small_rects + small_rects.append([rect, cropped_page]) + return large_rects, small_rects -def parse_and_label_content_structures(page, large_enough_rects, too_small_rects): - for coordinates, cropped_image in large_enough_rects: +def parse_content_structures(page, large_rects, small_rects): + for coordinates, cropped_image in large_rects: non_text_rects = detect_figures(cropped_image) - print(len(non_text_rects), len(list(non_text_rects))) + if len(non_text_rects) == 0: page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True) + elif tables_in_image(cropped_image)[0]: page = draw_rectangles(page, [coordinates], color=(255, 0, 0), annotate=True) + stats = parse_table(page) + page = draw_rectangles(page, stats, annotate=True) + else: page = draw_rectangles(page, [coordinates], color=(0, 0, 255), annotate=True) - # for coordinates, cropped_image in too_small_rects: + # for coordinates, cropped_image in small_rects: # non_text_rects = detect_figures(cropped_image) # if len(non_text_rects) == 0 and len(list(find_primary_text_regions(cropped_image))) > 0: # page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True) @@ -44,16 +44,13 @@ def parse_and_label_content_structures(page, large_enough_rects, too_small_rects return page -def detect_figures_over_layout(): - # pdf_path = "/home/lillian/PycharmProjects/ner_address/data/pdfs/syngenta/026c917f04660aaea4bb57d180f9598b.pdf" - # pdf_path = "/home/lillian/ocr_docs/ocr1.pdf" - pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf" - #pdf_path = "/home/lillian/ocr_docs/VV-857853.pdf" - page_index = 13 +def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False): layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True) big_structures, small_structures = cut_out_content_structures(layout_rects, page) - page = parse_and_label_content_structures(page, big_structures, small_structures) - show_mpl(page) + page = parse_content_structures(page, big_structures, small_structures) + if show: + show_mpl(page) + else: + return page -detect_figures_over_layout() diff --git a/cv_analysis/figure_detection.py b/cv_analysis/figure_detection.py index 4c44a57..ab43883 100644 --- a/cv_analysis/figure_detection.py +++ b/cv_analysis/figure_detection.py @@ -1,6 +1,7 @@ import cv2 import numpy as np from pdf2image import pdf2image +from PIL import Image from cv_analysis.utils.detection import detect_large_coherent_structures from cv_analysis.utils.display import show_mpl @@ -16,9 +17,7 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): def detect_figures(image: np.array): image = image.copy() - #show_mpl(image) image = remove_primary_text_regions(image) - #show_mpl(image) cnts = detect_large_coherent_structures(image) cnts = filter(is_likely_figure, cnts) @@ -41,10 +40,18 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False): return page -def figures_in_image(cropped_page): - redaction_contours = detect_figures(cropped_page) - if len(redaction_contours) > 0: - return True - else: - return False +# pages = [] +# for i in range(0,16): +# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf" +# page_index = i +# page = detect_figures_in_pdf(pdf_path, page_index, show=False) +# pages.append(Image.fromarray(page)) +# p1, p = pages[0], pages[1:] +# +# out_pdf_path = "/home/lillian/ocr_docs/out.pdf" +# +# p1.save( +# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p +# ) + diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 83dd2e5..f6b3286 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -156,10 +156,11 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False): else: return page + def tables_in_image(cropped_image): table_rects = parse_table(cropped_image) - if len(table_rects)>0: + if len(table_rects) > 0: return True, table_rects else: return False, None diff --git a/cv_analysis/utils/text.py b/cv_analysis/utils/text.py index 31f3d2c..6161db2 100644 --- a/cv_analysis/utils/text.py +++ b/cv_analysis/utils/text.py @@ -1,4 +1,6 @@ import cv2 +import numpy as np + from cv_analysis.utils.display import show_mpl def remove_primary_text_regions(image): @@ -14,12 +16,9 @@ def remove_primary_text_regions(image): image = image.copy() cnts = find_primary_text_regions(image) - for cnt in cnts: x, y, w, h = cv2.boundingRect(cnt) - print(x,y,w,h, w*h, w/h) cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) - return image @@ -38,7 +37,6 @@ def find_primary_text_regions(image): def is_likely_primary_text_segments(cnt): x,y,w,h = cv2.boundingRect(cnt) - print(cv2.contourArea(cnt)) return 800 < cv2.contourArea(cnt) < 16000 or w/h > 3 image = image.copy() @@ -48,14 +46,17 @@ def find_primary_text_regions(image): image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) #20,3 close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1) - show_mpl(close) - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 5)) + + #show_mpl(close) + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(7, 4)) #5,3 dilate = cv2.dilate(close, dilate_kernel, iterations=1) - show_mpl(dilate) + + #show_mpl(dilate) + cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) cnts = filter(is_likely_primary_text_segments, cnts) return cnts - diff --git a/scripts/annotate.py b/scripts/annotate.py index 03ab3db..e9eae7d 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -4,13 +4,14 @@ from cv_analysis.table_parsing import annotate_tables_in_pdf from cv_analysis.redaction_detection import annotate_redactions_in_pdf from cv_analysis.layout_parsing import annotate_layout_in_pdf from cv_analysis.figure_detection import detect_figures_in_pdf +from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("pdf_path") parser.add_argument("page_index", type=int) - parser.add_argument("--type", choices=["table", "redaction", "layout", "figure"]) + parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"]) args = parser.parse_args() @@ -27,3 +28,5 @@ if __name__ == "__main__": annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=True) elif args.type == "figure": detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True) + elif args.type == "figure2": + detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=True)