"""Prepare the PDF example pages used as test data.

The script reads ``example_pages.json`` (a mapping of PDF file name to a list
of ``[first_page, last_page]`` sections) and offers three steps built on it:
renaming the listed PDFs to their SHA-256 digest, collecting a metadata table
of the selected pages, and rendering those pages to PNG images.
"""
import hashlib
import json
import os
from itertools import chain, count
from os import path

import pandas as pd
from pdf2image import convert_from_path

from cv_analysis.config import get_config

settings = get_config()

# Column order of the metadata table; matches the entries produced by the
# callable returned from make_metadata_entry_maker().
METADATA_COLUMNS = ["image_name", "pdf_name", "page"]


def _image_stem(index):
    """Return the extension-less image name for the ``index``-th selected page.

    Shared by the metadata table and the PNG export so both always agree on
    how an image is called (zero-padded to three digits, e.g. ``fig_table007``).
    """
    return f"fig_table{index:0>3}"


def read_json(path):
    """Load and return the JSON document stored at ``path`` (read as UTF-8).

    Note: the parameter name shadows the module-level ``os.path`` alias inside
    this function; it is kept because it is part of the call signature.
    """
    with open(path, encoding="utf-8") as file:
        data = json.load(file)
    return data


def collect_metadata(example_pages, save=False):
    """Build the (image name, PDF name, page number) table for all sections.

    Args:
        example_pages: Mapping of PDF file name to an iterable of page
            sections, each indexable as ``[first_page, last_page]``.
        save: When true, write the table to ``metadata_testing_files.csv`` in
            ``settings.paths.dvc_data_dir`` instead of returning it.

    Returns:
        The metadata ``DataFrame`` when ``save`` is false, otherwise ``None``.
    """
    make_metadata_entry = make_metadata_entry_maker()
    # One entry maker is shared across all documents so the running image
    # index continues from one PDF to the next.
    rows = chain.from_iterable(
        f(name, document_sections, make_metadata_entry)
        for name, document_sections in example_pages.items()
    )
    df = pd.DataFrame(data=list(rows), columns=METADATA_COLUMNS)
    if not save:
        return df
    df.to_csv(path.join(settings.paths.dvc_data_dir, "metadata_testing_files.csv"))
    return None


def f(name, document_sections, make_metadata_entry):
    """Yield one metadata entry per page of every section of one document.

    Args:
        name: PDF file name the sections belong to.
        document_sections: Iterable of sections; ``section[0]`` and
            ``section[1]`` are the first and last page, both inclusive.
        make_metadata_entry: Callable ``(name, page_nr) -> entry``.
    """
    for pages in document_sections:
        for page_nr in range(pages[0], pages[1] + 1):
            yield make_metadata_entry(name, page_nr)


def make_metadata_entry_maker():
    """Return a callable that builds metadata entries with a running index.

    Each call of the returned function yields ``[image_name, name, page_nr]``
    where ``image_name`` uses the next index, starting at 0.
    """
    indices = count()

    def make_metadata_entry(name, page_nr):
        return [_image_stem(next(indices)), name, page_nr]

    return make_metadata_entry


def split_pdf(example_pages):
    """Render every selected page to a 300 dpi PNG file.

    PDFs are read from ``settings.paths.pdf_for_testing`` and images are
    written to ``settings.paths.png_for_testing``. Images are numbered with a
    single running index across all documents, in the same order in which
    ``collect_metadata`` enumerates the pages.

    Args:
        example_pages: Mapping of PDF file name to an iterable of page
            sections, each indexable as ``[first_page, last_page]``.
    """
    dir_path = settings.paths.pdf_for_testing
    indices = count()
    for name, document_sections in example_pages.items():
        pdf_path = path.join(dir_path, name)
        for pages in document_sections:
            images = convert_from_path(
                pdf_path=pdf_path, dpi=300, first_page=pages[0], last_page=pages[1]
            )
            for image in images:
                fp = path.join(
                    settings.paths.png_for_testing,
                    f"{_image_stem(next(indices))}.png",
                )
                image.save(fp=fp, dpi=(300, 300))


def find_hash(file_path, block_size=65536):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is read in chunks of ``block_size`` bytes so large files are not
    loaded into memory at once.
    """
    file_hash = hashlib.sha256()
    with open(file_path, "rb") as file:
        # read() returns b"" at end of file, which terminates the iterator.
        for block in iter(lambda: file.read(block_size), b""):
            file_hash.update(block)
    return file_hash.hexdigest()


def rename_files_with_hash(example_pages):
    """Rename the listed PDFs to ``<sha256-of-content>.pdf``.

    Only files in ``settings.paths.hashed_pdfs_for_testing`` whose name is a
    key of ``example_pages`` are renamed; everything else is left untouched.
    The resulting directory listing is printed afterwards.
    """
    files_to_rename = set(example_pages)
    folder = settings.paths.hashed_pdfs_for_testing
    for file in os.listdir(folder):
        if file not in files_to_rename:
            continue
        old_name = path.join(folder, file)
        # The digest is taken from the file content, so it must be computed
        # before the rename while the old path still exists.
        new_name = path.join(folder, find_hash(old_name) + ".pdf")
        os.rename(old_name, new_name)
    # Print the directory content so the result can be verified by eye.
    print(os.listdir(folder))


def main():
    """Run the data-preparation steps on ``example_pages.json``."""
    examples_pages = read_json(path.join(settings.paths.test_data_dir, "example_pages.json"))
    rename_files_with_hash(examples_pages)
    # Further steps, currently disabled:
    # collect_metadata(examples_pages, save=True)
    # split_pdf(examples_pages)


if __name__ == "__main__":
    main()