Added locations and changed names for test files

This commit is contained in:
llocarnini 2022-05-24 09:31:29 +02:00
parent 179ad20165
commit c4c85ace6d
4 changed files with 20 additions and 17 deletions

View File

@ -4,7 +4,8 @@ from pdf2image import pdf2image
import pandas as pd
from PIL import Image
import timeit
from os import path
from cv_analysis.locations import METADATA_TESTFILES, PNG_FOR_TESTING, PNG_FIGURES_DETECTED
from cv_analysis.utils.detection import detect_large_coherent_structures
from cv_analysis.utils.display import show_mpl
from cv_analysis.utils.draw import draw_rectangles
@ -84,18 +85,16 @@ def detect_figures_in_png(pdf_path, show=False):
def detect_figures_in_test_files_png():
file_name = pd.read_csv("/home/lillian/ocr_docs/metadata/metadata2.csv")
path = "/home/lillian/ocr_docs/png_example_pages/"
file_name = pd.read_csv(METADATA_TESTFILES)
pages = []
page_index = 0
t0 = timeit.default_timer()
for name in file_name.image_name:
page = detect_figures_in_png(path+name+".png", page_index, show=False)
page = detect_figures_in_png(path.join(PNG_FOR_TESTING, name+".png"))
pages.append(Image.fromarray(page))
t1 = timeit.default_timer()
print(t1-t0)
p1, p = pages[0], pages[1:]
out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf"
out_pdf_path = path.join(PNG_FIGURES_DETECTED, "fig_detectes.pdf")
p1.save(
out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
out_pdf_path, "PDF", resolution=300.0, save_all=True, append_images=p
)

View File

@ -12,7 +12,11 @@ LOG_FILE = "/tmp/log.log"
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed")
PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected")
PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp")
HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed")
METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv")
TEST_DIR = path.join(MODULE_DIR, "test")
TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data")

View File

@ -4,7 +4,7 @@ import pandas as pd
from pdf2image import convert_from_path
from itertools import chain
import json
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING
from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files
def read_json(path):
@ -56,7 +56,7 @@ def make_metadata_entry_maker():
def make_metadata_entry(name, page_nr):
nonlocal i
i += 1
return ["fig_table" + str(i), name, page_nr]
return [f"fig_table{i:0>3}", name, page_nr]
return make_metadata_entry
@ -69,14 +69,14 @@ def split_pdf(example_pages):
images = convert_from_path(pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0],
last_page=pages[1])
for image in images:
fp = path.join(PNG_FOR_TESTING, "fig_table" + str(i) + ".png")
fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png")
image.save(fp=fp, dpi=(300, 300))
i += 1
def rename_files_with_hash(example_pages,hashes):
def rename_files_with_hash(example_pages, hashes):
files_to_rename = list(example_pages.keys())
folder = HASHED_PDFS
folder = HASHED_PDFS_FOR_TESTING
# Iterate through the folder
for file in os.listdir(folder):
@ -107,9 +107,9 @@ def hash_pdfs(example_pages):
def main():
examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
examples_pages = hash_pdfs(examples_pages)
# examples_pages = hash_pdfs(examples_pages)
collect_metadata(examples_pages, save=True)
#split_pdf(examples_pages)
split_pdf(examples_pages)
if __name__ == "__main__":

View File

@ -3,7 +3,7 @@ import argparse
from cv_analysis.table_parsing import annotate_tables_in_pdf
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
from cv_analysis.layout_parsing import annotate_layout_in_pdf
from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files
from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files_png
from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
@ -30,4 +30,4 @@ if __name__ == "__main__":
elif args.type == "figure":
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
elif args.type == "figures":
detect_figures_in_test_files()
detect_figures_in_test_files_png()