cv-analysis-service/scripts/export_example_pages.py
Julius Unverfehrt 0a11471191 feat(opentel,dynaconf): adapt new pyinfra
This commit also disables a broken test that cannot be fixed. There are
also many scripts that didn't work anyway (and are not needed, in my
view) that were not updated. The scripts that are needed to run the
service processing locally still work.
2024-02-08 11:19:33 +01:00

114 lines
3.2 KiB
Python

import hashlib
import json
import os
from itertools import chain
from os import path
import pandas as pd
from pdf2image import convert_from_path
from cv_analysis.config import get_config
settings = get_config()
def read_json(path):
    """Load and return the parsed contents of the UTF-8 JSON file at *path*."""
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)
def collect_metadata(example_pages, save=False):
    """Build a page-level metadata table for the example PDF pages.

    Parameters
    ----------
    example_pages : dict
        Maps a PDF file name to a list of (first_page, last_page) sections.
    save : bool
        When True, additionally write the table to
        ``metadata_testing_files.csv`` in the DVC data directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_name``, ``pdf_name`` and ``page`` — one row per
        exported page image. (Previously the function implicitly returned
        None when ``save`` was True; the frame is now returned in both
        cases, which is backward compatible.)
    """
    make_metadata_entry = make_metadata_entry_maker()
    # One generator of rows per document; flatten into a single row list so
    # the running fig_tableNNN index matches split_pdf's export order.
    rows = chain.from_iterable(
        f(name, document_sections, make_metadata_entry)
        for name, document_sections in example_pages.items()
    )
    df = pd.DataFrame(data=list(rows), columns=["image_name", "pdf_name", "page"])
    if save:
        df.to_csv(path.join(settings.paths.dvc_data_dir, "metadata_testing_files.csv"))
    return df
def f(name, document_sections, make_metadata_entry):
    """Yield one metadata entry per page covered by *document_sections*.

    Each section is an inclusive ``(first_page, last_page)`` pair; entries
    are produced through the supplied *make_metadata_entry* callback so the
    caller controls the row format and numbering.
    """
    for section in document_sections:
        for page_nr in range(section[0], section[1] + 1):
            yield make_metadata_entry(name, page_nr)
def make_metadata_entry_maker():
    """Return a factory producing metadata rows with a running image index.

    Successive calls to the returned function yield rows whose first field
    is ``fig_table000``, ``fig_table001``, ... matching the PNG export order
    used by split_pdf.
    """
    counter = [0]  # mutable cell: shared running index across calls

    def make_metadata_entry(name, page_nr):
        index = counter[0]
        counter[0] = index + 1
        return [f"fig_table{index:0>3}", name, page_nr]

    return make_metadata_entry
def split_pdf(example_pages):
    """Render the configured page ranges of each example PDF to PNG files.

    PNGs land in ``settings.paths.png_for_testing`` as ``fig_table000.png``,
    ``fig_table001.png``, ... — the same running order in which
    collect_metadata numbers its rows.
    """
    pdf_dir = settings.paths.pdf_for_testing
    image_index = 0
    for name, document_sections in example_pages.items():
        for section in document_sections:
            rendered = convert_from_path(
                pdf_path=path.join(pdf_dir, name),
                dpi=300,
                first_page=section[0],
                last_page=section[1],
            )
            for image in rendered:
                target = path.join(
                    settings.paths.png_for_testing, f"fig_table{image_index:0>3}.png"
                )
                image.save(fp=target, dpi=(300, 300))
                image_index += 1
def find_hash(file_path):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is streamed in 64 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # iter(callable, sentinel) stops when read() returns the empty bytes.
        for chunk in iter(lambda: stream.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
def rename_files_with_hash(example_pages):
    """Rename the example PDFs in the hashed-PDFs folder to ``<sha256>.pdf``.

    Only files whose names appear as keys of *example_pages* are renamed;
    everything else in the folder is left untouched. The folder's final
    listing is printed so the result can be checked by eye.

    Fixes over the previous version: the unused ``only_name`` local is
    removed, the ``hash`` variable no longer shadows the builtin, and the
    membership check uses a set instead of a list.

    Parameters
    ----------
    example_pages : dict
        Maps original PDF file names to their page sections; only the keys
        are used here.
    """
    files_to_rename = set(example_pages)
    folder = settings.paths.hashed_pdfs_for_testing
    for file in os.listdir(folder):
        if file in files_to_rename:
            old_name = path.join(folder, file)
            # Name the file after its own content hash so identical PDFs
            # always map to the same, anonymised file name.
            file_hash = find_hash(old_name)
            new_name = path.join(folder, file_hash + ".pdf")
            os.rename(old_name, new_name)
    # Verify the result.
    res = os.listdir(folder)
    print(res)
def main():
    """Entry point: rename the example PDFs to their content hashes.

    The metadata export and PDF-splitting steps are kept below (disabled)
    as a record of the full original pipeline.
    """
    example_pages = read_json(
        path.join(settings.paths.test_data_dir, "example_pages.json")
    )
    rename_files_with_hash(example_pages)
    # collect_metadata(example_pages, save=True)
    # split_pdf(example_pages)


if __name__ == "__main__":
    main()