"""Export page metadata and per-page PNG images for the example PDFs.

Run as a script, this reads ``example_pages.json`` from ``TEST_DATA_DIR``,
writes a metadata CSV into ``DVC_DATA_DIR`` and renders the listed page
ranges of each PDF to PNG files in ``PNG_FOR_TESTING``.
"""
import json
import os
from itertools import chain
from os import path

import pandas as pd
from pdf2image import convert_from_path

from cv_analysis.locations import (
    DVC_DATA_DIR,
    HASHED_PDFS_FOR_TESTING,
    PDF_FOR_TESTING,
    PNG_FOR_TESTING,
    TEST_DATA_DIR,
)
from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files

# Column order of the metadata table; matches the entries built by
# ``make_metadata_entry_maker``.
_METADATA_COLUMNS = ["image_name", "pdf_name", "page"]


def _image_name(index):
    """Return the zero-padded image name used for the ``index``-th page.

    Shared by the metadata table and the PNG export so that both always
    produce the same ``fig_tableNNN`` names.
    """
    return f"fig_table{index:0>3}"


def read_json(path):
    """Load and return the JSON document stored at ``path`` (UTF-8)."""
    # NOTE: the parameter name shadows the module-level ``os.path`` import
    # inside this function only; it is kept for keyword-argument callers.
    with open(path, encoding='utf-8') as file:
        return json.load(file)


def collect_metadata(example_pages, save=False):
    """Build the table that maps image names to their source PDF and page.

    Args:
        example_pages: Mapping of PDF name to an iterable of page ranges.
            Each range is indexable; element 0 is the first page and
            element 1 the last page (inclusive).
        save: When true, write the table to ``metadata_testing_files.csv``
            inside ``DVC_DATA_DIR`` instead of returning it.

    Returns:
        A ``pandas.DataFrame`` with the columns ``image_name``, ``pdf_name``
        and ``page`` when ``save`` is false; ``None`` when the table was
        written to disk.
    """
    make_metadata_entry = make_metadata_entry_maker()
    # A single entry maker is shared across all documents so the running
    # image index keeps counting from one PDF to the next.
    entries = chain.from_iterable(
        f(name, document_sections, make_metadata_entry)
        for name, document_sections in example_pages.items()
    )
    df = pd.DataFrame(data=list(entries), columns=_METADATA_COLUMNS)

    if save:
        df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
        return None
    return df


def f(name, document_sections, make_metadata_entry):
    """Yield one metadata entry per page of every section of one document.

    Args:
        name: PDF name passed through to ``make_metadata_entry``.
        document_sections: Iterable of page ranges; ``pages[0]`` is the first
            and ``pages[1]`` the last page of a range (inclusive).
        make_metadata_entry: Callable ``(name, page_nr) -> entry``.

    Yields:
        Whatever ``make_metadata_entry`` returns, in page order.
    """
    for pages in document_sections:
        for page_nr in range(pages[0], pages[1] + 1):
            yield make_metadata_entry(name, page_nr)


def make_metadata_entry_maker():
    """Create an entry factory that numbers its entries consecutively.

    Returns:
        A function ``(name, page_nr) -> [image_name, name, page_nr]`` whose
        ``image_name`` is ``fig_table000`` on the first call, ``fig_table001``
        on the second, and so on. Every factory has its own counter.
    """
    i = -1

    def make_metadata_entry(name, page_nr):
        nonlocal i
        i += 1
        return [_image_name(i), name, page_nr]

    return make_metadata_entry


def split_pdf(example_pages):
    """Render the listed page ranges of each PDF to numbered PNG files.

    PDFs are read from ``PDF_FOR_TESTING``; every rendered page is saved at
    300 dpi as ``fig_tableNNN.png`` in ``PNG_FOR_TESTING``, numbered in the
    order the pages are produced.

    Args:
        example_pages: Mapping of PDF file name to an iterable of page
            ranges (``pages[0]`` first page, ``pages[1]`` last page).
    """
    dir_path = PDF_FOR_TESTING
    i = 0
    for name, document_sections in example_pages.items():
        for pages in document_sections:
            images = convert_from_path(
                pdf_path=path.join(dir_path, name),
                dpi=300,
                first_page=pages[0],
                last_page=pages[1],
            )
            for image in images:
                fp = path.join(PNG_FOR_TESTING, f"{_image_name(i)}.png")
                image.save(fp=fp, dpi=(300, 300))
                i += 1


def rename_files_with_hash(example_pages, hashes):
    """Rename the files in ``HASHED_PDFS_FOR_TESTING`` named in ``example_pages``.

    Every directory entry whose name is a key of ``example_pages`` is renamed
    to ``<stem>_new.txt``; afterwards the directory listing is printed.

    NOTE(review): ``hashes`` is accepted but never used, and the new name is
    a fixed ``_new.txt`` suffix rather than anything hash-based. This looks
    like an unfinished placeholder - confirm the intended naming scheme
    before relying on this function.
    """
    folder = HASHED_PDFS_FOR_TESTING
    for file in os.listdir(folder):
        if file not in example_pages:
            continue
        old_name = path.join(folder, file)
        # Keep the stem, replace the extension.
        only_name = path.splitext(file)[0]
        new_name = path.join(folder, only_name + '_new' + '.txt')
        os.rename(old_name, new_name)

    # Print the resulting directory content so the outcome can be checked.
    res = os.listdir(folder)
    print(res)


def hash_pdfs(example_pages):
    """Return a copy of ``example_pages`` keyed by PDF hash instead of name.

    The PDF paths are built from ``PDF_FOR_TESTING`` and the mapping's keys
    and passed to ``hash_pdf_files``; the returned hashes are paired with the
    original values by position.

    NOTE(review): assumes ``hash_pdf_files`` returns exactly one hash per
    path, in input order - verify against its implementation. Because the
    pairing uses ``zip``, a shorter result would silently drop entries, and
    identical hashes would collapse into a single key.
    """
    pdf_paths = [path.join(PDF_FOR_TESTING, pdf_name) for pdf_name in example_pages]
    hashes = hash_pdf_files(paths=pdf_paths, verbose=0)
    return dict(zip(hashes, example_pages.values()))


def main():
    """Write the metadata CSV and export the page images for the example PDFs."""
    examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
    # NOTE: re-keying by PDF hash is currently disabled:
    # examples_pages = hash_pdfs(examples_pages)
    collect_metadata(examples_pages, save=True)
    split_pdf(examples_pages)


if __name__ == "__main__":
    main()