This commit also disables a broken test that connot be fixed. There are also many scripts that didn't work anyways (and are not needed in my eyes) that were not updatet. The scripts that are needed to run the service processing locally still work.
114 lines
3.2 KiB
Python
114 lines
3.2 KiB
Python
import hashlib
|
|
import json
|
|
import os
|
|
from itertools import chain
|
|
from os import path
|
|
|
|
import pandas as pd
|
|
from pdf2image import convert_from_path
|
|
|
|
from cv_analysis.config import get_config
|
|
|
|
settings = get_config()
|
|
|
|
|
|
def read_json(path):
|
|
with open(path, encoding="utf-8") as file:
|
|
data = json.load(file)
|
|
return data
|
|
|
|
|
|
def collect_metadata(example_pages, save=False):
|
|
metadata = []
|
|
make_metadata_entry = make_metadata_entry_maker()
|
|
for name, document_sections in example_pages.items():
|
|
metadata.append(f(name, document_sections, make_metadata_entry))
|
|
metadata = list(chain.from_iterable(metadata))
|
|
if save:
|
|
df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
|
df.to_csv(path.join(settings.paths.dvc_data_dir, "metadata_testing_files.csv"))
|
|
else:
|
|
return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
|
|
|
|
|
def f(name, document_sections, make_metadata_entry):
|
|
for pages in document_sections:
|
|
span = list(range(pages[0], pages[1] + 1))
|
|
for page_nr in span:
|
|
yield make_metadata_entry(name, page_nr)
|
|
|
|
|
|
def make_metadata_entry_maker():
|
|
i = -1
|
|
|
|
def make_metadata_entry(name, page_nr):
|
|
nonlocal i
|
|
i += 1
|
|
return [f"fig_table{i:0>3}", name, page_nr]
|
|
|
|
return make_metadata_entry
|
|
|
|
|
|
def split_pdf(example_pages):
|
|
dir_path = settings.paths.pdf_for_testing
|
|
i = 0
|
|
for name, document_sections in example_pages.items():
|
|
for pages in document_sections:
|
|
images = convert_from_path(
|
|
pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
|
|
)
|
|
for image in images:
|
|
fp = path.join(settings.paths.png_for_testing, f"fig_table{i:0>3}.png")
|
|
image.save(fp=fp, dpi=(300, 300))
|
|
i += 1
|
|
|
|
|
|
def find_hash(file_path):
|
|
BLOCK_SIZE = 65536
|
|
|
|
file_hash = hashlib.sha256()
|
|
with open(file_path, "rb") as f:
|
|
fb = f.read(BLOCK_SIZE)
|
|
while len(fb) > 0:
|
|
file_hash.update(fb)
|
|
fb = f.read(BLOCK_SIZE)
|
|
|
|
return file_hash.hexdigest()
|
|
|
|
|
|
def rename_files_with_hash(example_pages):
|
|
files_to_rename = list(example_pages.keys())
|
|
folder = settings.paths.hashed_pdfs_for_testing
|
|
|
|
# Iterate through the folder
|
|
for file in os.listdir(folder):
|
|
# Checking if the file is present in the list
|
|
if file in files_to_rename:
|
|
# construct current name using file name and path
|
|
old_name = path.join(folder, file)
|
|
# get file name without extension
|
|
only_name = path.splitext(file)[0]
|
|
|
|
# Adding the new name with extension
|
|
hash = find_hash(old_name)
|
|
# construct full file path
|
|
new_name = path.join(folder, hash + ".pdf")
|
|
|
|
# Renaming the file
|
|
os.rename(old_name, new_name)
|
|
|
|
# verify the result
|
|
res = os.listdir(folder)
|
|
print(res)
|
|
|
|
|
|
def main():
|
|
examples_pages = read_json(path.join(settings.paths.test_data_dir, "example_pages.json"))
|
|
rename_files_with_hash(examples_pages)
|
|
# collect_metadata(examples_pages, save=True)
|
|
# split_pdf(examples_pages)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|