diff --git a/cv_analysis/figure_detection.py b/cv_analysis/figure_detection.py index 468500f..32db8d5 100644 --- a/cv_analysis/figure_detection.py +++ b/cv_analysis/figure_detection.py @@ -4,7 +4,8 @@ from pdf2image import pdf2image import pandas as pd from PIL import Image import timeit - +from os import path +from cv_analysis.locations import METADATA_TESTFILES, PNG_FOR_TESTING, PNG_FIGURES_DETECTED from cv_analysis.utils.detection import detect_large_coherent_structures from cv_analysis.utils.display import show_mpl from cv_analysis.utils.draw import draw_rectangles @@ -84,18 +85,16 @@ def detect_figures_in_png(pdf_path, show=False): def detect_figures_in_test_files_png(): - file_name = pd.read_csv("/home/lillian/ocr_docs/metadata/metadata2.csv") - path = "/home/lillian/ocr_docs/png_example_pages/" + file_name = pd.read_csv(METADATA_TESTFILES) pages = [] - page_index = 0 t0 = timeit.default_timer() for name in file_name.image_name: - page = detect_figures_in_png(path+name+".png", page_index, show=False) + page = detect_figures_in_png(path.join(PNG_FOR_TESTING, name+".png")) pages.append(Image.fromarray(page)) t1 = timeit.default_timer() print(t1-t0) p1, p = pages[0], pages[1:] - out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf" + out_pdf_path = path.join(PNG_FIGURES_DETECTED, "fig_detectes.pdf") p1.save( - out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p + out_pdf_path, "PDF", resolution=300.0, save_all=True, append_images=p ) \ No newline at end of file diff --git a/cv_analysis/locations.py b/cv_analysis/locations.py index fee3248..07e15f6 100644 --- a/cv_analysis/locations.py +++ b/cv_analysis/locations.py @@ -12,7 +12,11 @@ LOG_FILE = "/tmp/log.log" DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data") PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing") PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing") -HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed") +PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected") +PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp") +HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed") +METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv") + TEST_DIR = path.join(MODULE_DIR, "test") TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data") diff --git a/cv_analysis/test/scripts/export_example_pages.py b/cv_analysis/test/scripts/export_example_pages.py index 5aff47e..65ffbb0 100644 --- a/cv_analysis/test/scripts/export_example_pages.py +++ b/cv_analysis/test/scripts/export_example_pages.py @@ -4,7 +4,7 @@ import pandas as pd from pdf2image import convert_from_path from itertools import chain import json -from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS +from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files def read_json(path): @@ -56,7 +56,7 @@ def make_metadata_entry_maker(): def make_metadata_entry(name, page_nr): nonlocal i i += 1 - return ["fig_table" + str(i), name, page_nr] + return [f"fig_table{i:0>3}", name, page_nr] return make_metadata_entry @@ -69,14 +69,14 @@ def split_pdf(example_pages): images = convert_from_path(pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]) for image in images: - fp = path.join(PNG_FOR_TESTING, "fig_table" + str(i) + ".png") + fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png") image.save(fp=fp, dpi=(300, 300)) i += 1 -def rename_files_with_hash(example_pages,hashes): +def rename_files_with_hash(example_pages, hashes): files_to_rename = list(example_pages.keys()) - folder = HASHED_PDFS + folder = HASHED_PDFS_FOR_TESTING # Iterate through the folder for file in os.listdir(folder): @@ -107,9 +107,9 @@ def hash_pdfs(example_pages): def main(): examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json")) - examples_pages = hash_pdfs(examples_pages) + # examples_pages = hash_pdfs(examples_pages) collect_metadata(examples_pages, save=True) - #split_pdf(examples_pages) + split_pdf(examples_pages) if __name__ == "__main__": diff --git a/scripts/annotate.py b/scripts/annotate.py index 34b007c..8410ce2 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -3,7 +3,7 @@ import argparse from cv_analysis.table_parsing import annotate_tables_in_pdf from cv_analysis.redaction_detection import annotate_redactions_in_pdf from cv_analysis.layout_parsing import annotate_layout_in_pdf -from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files +from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files_png from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing @@ -30,4 +30,4 @@ if __name__ == "__main__": elif args.type == "figure": detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True) elif args.type == "figures": - detect_figures_in_test_files() + detect_figures_in_test_files_png()