Added path constants to cv_analysis.locations, replaced hard-coded test-file paths with them, and zero-padded the generated test-file names
This commit is contained in:
parent
179ad20165
commit
c4c85ace6d
@ -4,7 +4,8 @@ from pdf2image import pdf2image
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
import timeit
|
||||
|
||||
from os import path
|
||||
from cv_analysis.locations import METADATA_TESTFILES, PNG_FOR_TESTING, PNG_FIGURES_DETECTED
|
||||
from cv_analysis.utils.detection import detect_large_coherent_structures
|
||||
from cv_analysis.utils.display import show_mpl
|
||||
from cv_analysis.utils.draw import draw_rectangles
|
||||
@ -84,18 +85,16 @@ def detect_figures_in_png(pdf_path, show=False):
|
||||
|
||||
|
||||
def detect_figures_in_test_files_png():
|
||||
file_name = pd.read_csv("/home/lillian/ocr_docs/metadata/metadata2.csv")
|
||||
path = "/home/lillian/ocr_docs/png_example_pages/"
|
||||
file_name = pd.read_csv(METADATA_TESTFILES)
|
||||
pages = []
|
||||
page_index = 0
|
||||
t0 = timeit.default_timer()
|
||||
for name in file_name.image_name:
|
||||
page = detect_figures_in_png(path+name+".png", page_index, show=False)
|
||||
page = detect_figures_in_png(path.join(PNG_FOR_TESTING, name+".png"))
|
||||
pages.append(Image.fromarray(page))
|
||||
t1 = timeit.default_timer()
|
||||
print(t1-t0)
|
||||
p1, p = pages[0], pages[1:]
|
||||
out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf"
|
||||
out_pdf_path = path.join(PNG_FIGURES_DETECTED, "fig_detectes.pdf")
|
||||
p1.save(
|
||||
out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
|
||||
out_pdf_path, "PDF", resolution=300.0, save_all=True, append_images=p
|
||||
)
|
||||
@ -12,7 +12,11 @@ LOG_FILE = "/tmp/log.log"
|
||||
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
|
||||
PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
|
||||
PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
|
||||
HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed")
|
||||
PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected")
|
||||
PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp")
|
||||
HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed")
|
||||
METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv")
|
||||
|
||||
|
||||
TEST_DIR = path.join(MODULE_DIR, "test")
|
||||
TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data")
|
||||
|
||||
@ -4,7 +4,7 @@ import pandas as pd
|
||||
from pdf2image import convert_from_path
|
||||
from itertools import chain
|
||||
import json
|
||||
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS
|
||||
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING
|
||||
from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files
|
||||
|
||||
def read_json(path):
|
||||
@ -56,7 +56,7 @@ def make_metadata_entry_maker():
|
||||
def make_metadata_entry(name, page_nr):
|
||||
nonlocal i
|
||||
i += 1
|
||||
return ["fig_table" + str(i), name, page_nr]
|
||||
return [f"fig_table{i:0>3}", name, page_nr]
|
||||
|
||||
return make_metadata_entry
|
||||
|
||||
@ -69,14 +69,14 @@ def split_pdf(example_pages):
|
||||
images = convert_from_path(pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0],
|
||||
last_page=pages[1])
|
||||
for image in images:
|
||||
fp = path.join(PNG_FOR_TESTING, "fig_table" + str(i) + ".png")
|
||||
fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png")
|
||||
image.save(fp=fp, dpi=(300, 300))
|
||||
i += 1
|
||||
|
||||
def rename_files_with_hash(example_pages,hashes):
|
||||
def rename_files_with_hash(example_pages, hashes):
|
||||
|
||||
files_to_rename = list(example_pages.keys())
|
||||
folder = HASHED_PDFS
|
||||
folder = HASHED_PDFS_FOR_TESTING
|
||||
|
||||
# Iterate through the folder
|
||||
for file in os.listdir(folder):
|
||||
@ -107,9 +107,9 @@ def hash_pdfs(example_pages):
|
||||
|
||||
def main():
|
||||
examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
|
||||
examples_pages = hash_pdfs(examples_pages)
|
||||
# examples_pages = hash_pdfs(examples_pages)
|
||||
collect_metadata(examples_pages, save=True)
|
||||
#split_pdf(examples_pages)
|
||||
split_pdf(examples_pages)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -3,7 +3,7 @@ import argparse
|
||||
from cv_analysis.table_parsing import annotate_tables_in_pdf
|
||||
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
|
||||
from cv_analysis.layout_parsing import annotate_layout_in_pdf
|
||||
from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files
|
||||
from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files_png
|
||||
from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
|
||||
|
||||
|
||||
@ -30,4 +30,4 @@ if __name__ == "__main__":
|
||||
elif args.type == "figure":
|
||||
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
|
||||
elif args.type == "figures":
|
||||
detect_figures_in_test_files()
|
||||
detect_figures_in_test_files_png()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user