from cv_analysis.layout_parsing import annotate_layout_in_pdf
from cv_analysis.figure_detection import detect_figures
from cv_analysis.table_parsing import tables_in_image, parse_table
from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.display import show_mpl
from cv_analysis.utils.visual_logging import vizlogger
from PIL import Image

# Default area threshold (w * h) at or above which a layout rectangle is
# classified as "large" by cut_out_content_structures.
LARGE_RECT_MIN_AREA = 75000


def cut_out_content_structures(layout_rects, page, min_large_area=LARGE_RECT_MIN_AREA):
    """Crop every layout rectangle out of *page* and split the crops by area.

    Args:
        layout_rects: Iterable of ``(x, y, w, h)`` rectangles.
        page: Page image; must support 2-D slicing ``page[y0:y1, x0:x1]``.
        min_large_area: Rectangles with ``w * h >= min_large_area`` are
            reported as large, all others as small. Defaults to 75000.

    Returns:
        A ``(large_rects, small_rects)`` tuple. Each list holds
        ``[rect, cropped_page]`` pairs, where ``rect`` is the ``(x, y, w, h)``
        tuple and ``cropped_page`` is the matching slice of *page*.
    """
    large_rects = []
    small_rects = []
    for x, y, w, h in layout_rects:
        rect = (x, y, w, h)
        # The crop is the same for both size classes, so slice once.
        cropped_page = page[y:(y + h), x:(x + w)]
        target = large_rects if w * h >= min_large_area else small_rects
        target.append([rect, cropped_page])
    return large_rects, small_rects


def parse_content_structures(page, large_rects, small_rects):
    """Classify each large structure and draw a colour-coded box on *page*.

    For every ``(coordinates, cropped_image)`` pair in *large_rects*:

    * no figures detected in the crop -> treated as text, colour (0, 255, 0);
    * otherwise, first element of ``tables_in_image(crop)`` truthy -> treated
      as a table, the rectangles from ``parse_table`` are drawn in (255, 0, 0);
    * otherwise -> treated as a figure, colour (0, 0, 255).

    Args:
        page: Page image to annotate.
        large_rects: ``[rect, cropped_image]`` pairs as produced by
            ``cut_out_content_structures``.
        small_rects: Currently unused; its handling is commented out below.

    Returns:
        The page as returned by the last ``draw_rectangles`` call (or the
        input *page* unchanged when *large_rects* is empty).
    """
    for coordinates, cropped_image in large_rects:
        figure_rects = detect_figures(cropped_image)
        if len(figure_rects) == 0:
            # text
            page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
        elif tables_in_image(cropped_image)[0]:
            # table
            # NOTE(review): parse_table is run on the whole page, not on
            # cropped_image -- confirm this is intended.
            stats = parse_table(page)
            page = draw_rectangles(page, stats, color=(255, 0, 0), annotate=True)
        else:
            # figure
            page = draw_rectangles(page, [coordinates], color=(0, 0, 255), annotate=True)

    # for coordinates, cropped_image in small_rects:
    #     figure_rects = detect_figures(cropped_image)
    #     if len(figure_rects) == 0 and len(list(find_primary_text_regions(cropped_image))) > 0:
    #         page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
    #     else:
    #         page = draw_rectangles(page, [coordinates], color=(0, 255, 255), annotate=True)

    return page


def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
    """Annotate text, table and figure regions on one page of a PDF.

    Runs layout annotation on the requested page, crops the resulting
    rectangles, classifies the large ones and logs the annotated page via
    ``vizlogger`` under the name ``"figures03_final.png"``.

    Args:
        pdf_path: Path of the PDF, forwarded to ``annotate_layout_in_pdf``.
        page_index: Page index forwarded to ``annotate_layout_in_pdf``.
        show: When true, display the annotated page with ``show_mpl`` and
            return ``None``; otherwise return the annotated page.

    Returns:
        The annotated page when *show* is false, else ``None``.
    """
    layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
    big_structures, small_structures = cut_out_content_structures(layout_rects, page)
    page = parse_content_structures(page, big_structures, small_structures)
    vizlogger.debug(page, "figures03_final.png")
    if show:
        show_mpl(page)
    else:
        return page


# pages = []
# for i in range(0,16):
#     pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
#     page_index = i
#     layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
#     big_structures, small_structures = cut_out_content_structures(layout_rects, page)
#     page = parse_content_structures(page, big_structures, small_structures)
#     pages.append(Image.fromarray(page))
# p1, p = pages[0], pages[1:]
#
# out_pdf_path = "/home/lillian/ocr_docs/out1.pdf"
#
# p1.save(
#     out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
# )