From 179ad2016570af7e0a2ac8a6de1bcca8b4777acf Mon Sep 17 00:00:00 2001
From: llocarnini
Date: Tue, 17 May 2022 09:17:24 +0200
Subject: [PATCH] minor changes, refactoring and testfiles added

---
 .gitignore                                |   3 +-
 config.yaml                               |   2 +-
 cv_analysis/fig_detection_with_layout.py  |  16 +--
 cv_analysis/figure_detection.py           |  71 ++++++++---
 cv_analysis/locations.py                  |   3 +
 .../test/scripts/export_example_pages.py  | 116 ++++++++++++++++++
 cv_analysis/utils/post_processing.py      |   2 +-
 cv_analysis/utils/text.py                 |   4 +-
 data/.gitignore                           |   6 +
 data/pdfs_for_testing.dvc                 |   5 +
 data/pngs_for_testing.dvc                 |   5 +
 scripts/annotate.py                       |  10 +-
 12 files changed, 204 insertions(+), 39 deletions(-)
 create mode 100644 cv_analysis/test/scripts/export_example_pages.py
 create mode 100644 data/pdfs_for_testing.dvc
 create mode 100644 data/pngs_for_testing.dvc

diff --git a/.gitignore b/.gitignore
index f3b659b..5360d78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,6 @@ build_venv/
 /.idea/table_parsing.iml
 /.idea/vcs.xml
 /results/
-/data
 /table_parsing.egg-info
 /target/
 /tests/
@@ -22,3 +21,5 @@ build_venv/
 /cv_analysis.egg-info/SOURCES.txt
 /cv_analysis.egg-info/top_level.txt
 /.vscode/
+/cv_analysis/test/test_data/example_pages.json
+/data/metadata_testing_files.csv
diff --git a/config.yaml b/config.yaml
index 42bd2e7..fc6bb42 100644
--- a/config.yaml
+++ b/config.yaml
@@ -23,5 +23,5 @@ deskew:
   test_dummy: test_dummy
 
 visual_logging:
-  level: $LOGGING_LEVEL_ROOT|INFO
+  level: $LOGGING_LEVEL_ROOT|DEBUG
   output_folder: /tmp/debug/
\ No newline at end of file
diff --git a/cv_analysis/fig_detection_with_layout.py b/cv_analysis/fig_detection_with_layout.py
index 7f16244..ce1d71b 100644
--- a/cv_analysis/fig_detection_with_layout.py
+++ b/cv_analysis/fig_detection_with_layout.py
@@ -55,18 +55,4 @@ def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
     else:
         return page
 
-# pages = []
-# for i in range(0,16):
-# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
-# page_index = i
-# layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
-# big_structures, small_structures = cut_out_content_structures(layout_rects, page)
-# page = parse_content_structures(page, big_structures, small_structures)
-# pages.append(Image.fromarray(page))
-# p1, p = pages[0], pages[1:]
-#
-# out_pdf_path = "/home/lillian/ocr_docs/out1.pdf"
-#
-# p1.save(
-# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
-# )
+
diff --git a/cv_analysis/figure_detection.py b/cv_analysis/figure_detection.py
index a7db151..468500f 100644
--- a/cv_analysis/figure_detection.py
+++ b/cv_analysis/figure_detection.py
@@ -1,6 +1,9 @@
 import cv2
 import numpy as np
 from pdf2image import pdf2image
+import pandas as pd
+from PIL import Image
+import timeit
 
 from cv_analysis.utils.detection import detect_large_coherent_structures
 from cv_analysis.utils.display import show_mpl
@@ -33,7 +36,7 @@ def detect_figures(image: np.array):
 
 
 def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
-    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = pdf2image.convert_from_path(pdf_path, dpi=300, first_page=page_index + 1, last_page=page_index + 1)[0]
     page = np.array(page)
 
     redaction_contours = detect_figures(page)
@@ -43,16 +46,56 @@
         show_mpl(page)
     return page
 
-# pages = []
-# for i in range(0,16):
-# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
-# page_index = i
-# page = detect_figures_in_pdf(pdf_path,page_index)
-# pages.append(Image.fromarray(page))
-# p1, p = pages[0], pages[1:]
-#
-# out_pdf_path = "/home/lillian/ocr_docs/out.pdf"
-#
-# p1.save(
-# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
-# )
\ No newline at end of file
+
+def detect_figures_in_test_files():
+    """Annotate figure detections on the PDF test pages listed in metadata2.csv and save them as one PDF."""
+    def save_as_pdf(pages):
+        p1, p = pages[0], pages[1:]
+        out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_pdf.pdf"
+        p1.save(
+            out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
+        )
+    path = "/home/lillian/ocr_docs/"
+    ex_pages = pd.read_csv(path + "/metadata/metadata2.csv")
+    pages_detected = []
+
+    t0 = timeit.default_timer()
+    for name, page_nr in zip(ex_pages.pdf_name, ex_pages.page):
+        page = pdf2image.convert_from_path(path + "/original/" + name, dpi=300, first_page=page_nr, last_page=page_nr)[0]
+        page = np.array(page)
+        redaction_contours = detect_figures(page)
+        page = draw_rectangles(page, redaction_contours)
+        pages_detected.append(Image.fromarray(page))
+    print(timeit.default_timer() - t0)
+
+    save_as_pdf(pages_detected)
+
+
+def detect_figures_in_png(png_path, show=False):
+    """Run figure detection on a single PNG page and return the annotated image."""
+    page = Image.open(png_path)
+    page = np.array(page)
+
+    redaction_contours = detect_figures(page)
+    page = draw_rectangles(page, redaction_contours)
+    vizlogger.debug(page, "figures03_final.png")
+    if show:
+        show_mpl(page)
+    return page
+
+
+def detect_figures_in_test_files_png():
+    """Run figure detection on the exported PNG test pages and save the annotated pages as one PDF."""
+    metadata = pd.read_csv("/home/lillian/ocr_docs/metadata/metadata2.csv")
+    path = "/home/lillian/ocr_docs/png_example_pages/"
+    pages = []
+    t0 = timeit.default_timer()
+    for name in metadata.image_name:
+        page = detect_figures_in_png(path + name + ".png", show=False)
+        pages.append(Image.fromarray(page))
+    t1 = timeit.default_timer()
+    print(t1 - t0)
+    p1, p = pages[0], pages[1:]
+    out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf"
+    p1.save(
+        out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
+    )
\ No newline at end of file
diff --git a/cv_analysis/locations.py b/cv_analysis/locations.py
index 6e56ec1..fee3248 100644
--- a/cv_analysis/locations.py
+++ b/cv_analysis/locations.py
@@ -10,6 +10,9 @@ CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
 LOG_FILE = "/tmp/log.log"
 
 DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
+PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
+PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
+HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed")
 
 TEST_DIR = path.join(MODULE_DIR, "test")
 TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data")
diff --git a/cv_analysis/test/scripts/export_example_pages.py b/cv_analysis/test/scripts/export_example_pages.py
new file mode 100644
index 0000000..5aff47e
--- /dev/null
+++ b/cv_analysis/test/scripts/export_example_pages.py
@@ -0,0 +1,116 @@
+import os
+from os import path
+import pandas as pd
+from pdf2image import convert_from_path
+from itertools import chain
+import json
+from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS
+from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files
+
+
+def read_json(file_path):
+    with open(file_path, encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+
+# def collect_metadata(example_pages, save=False):
+#     metadata = []
+#     i = 0
+#     for name, document_sections in example_pages.items():
+#         for pages in document_sections:
+#             span = list(range(pages[0], pages[1] + 1))
+#             for page_nr in span:
+#                 metadata.append(["fig_table" + str(i), name, page_nr])
metadata.append(["fig_table" + str(i), name, page_nr]) +# i += 1 +# if save: +# df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"]) +# df.to_csv("/exported_files/test_pages.csv") +# else: +# return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"]) + + + +def collect_metadata(example_pages, save=False): + metadata = [] + make_metadata_entry = make_metadata_entry_maker() + for name, document_sections in example_pages.items(): + metadata.append(f(name, document_sections, make_metadata_entry)) + metadata = list(chain.from_iterable(metadata)) + if save: + df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"]) + df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv")) + else: + return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"]) + + +def f(name, document_sections, make_metadata_entry): + for pages in document_sections: + span = list(range(pages[0], pages[1] + 1)) + for page_nr in span: + yield make_metadata_entry(name, page_nr) + + +def make_metadata_entry_maker(): + i = -1 + + def make_metadata_entry(name, page_nr): + nonlocal i + i += 1 + return ["fig_table" + str(i), name, page_nr] + + return make_metadata_entry + + +def split_pdf(example_pages): + dir_path = PDF_FOR_TESTING + i = 0 + for name, document_sections in example_pages.items(): + for pages in document_sections: + images = convert_from_path(pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], + last_page=pages[1]) + for image in images: + fp = path.join(PNG_FOR_TESTING, "fig_table" + str(i) + ".png") + image.save(fp=fp, dpi=(300, 300)) + i += 1 + +def rename_files_with_hash(example_pages,hashes): + + files_to_rename = list(example_pages.keys()) + folder = HASHED_PDFS + + # Iterate through the folder + for file in os.listdir(folder): + # Checking if the file is present in the list + if file in files_to_rename: + # construct current name using file name and path + old_name = path.join(folder, file) + # get file name without extension + only_name = path.splitext(file)[0] + + # Adding the new name with extension + new_base = only_name + '_new' + '.txt' + # construct full file path + new_name = path.join(folder, new_base) + + # Renaming the file + os.rename(old_name, new_name) + + # verify the result + res = os.listdir(folder) + print(res) + +def hash_pdfs(example_pages): + pdf_paths = list(path.join(PDF_FOR_TESTING, pdf_name) for pdf_name in example_pages.keys()) + hashes = hash_pdf_files(paths=pdf_paths, verbose=0) + example_pages = dict(zip(hashes, example_pages.values())) + return example_pages + +def main(): + examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json")) + examples_pages = hash_pdfs(examples_pages) + collect_metadata(examples_pages, save=True) + #split_pdf(examples_pages) + + +if __name__ == "__main__": + main() diff --git a/cv_analysis/utils/post_processing.py b/cv_analysis/utils/post_processing.py index 753e091..1749f2d 100644 --- a/cv_analysis/utils/post_processing.py +++ b/cv_analysis/utils/post_processing.py @@ -25,7 +25,7 @@ def remove_included(rectangles): return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax def is_not_included(rect, rectangles): - return not any(included(r2, rect) for r2 in rectangles if not rect == r2) + return not any(includes(r2, rect) for r2 in rectangles if not rect == r2) rectangles = list(map(xywh_to_vec_rect, rectangles)) rectangles = filter(partial(is_not_included, rectangles=rectangles), 
diff --git a/cv_analysis/utils/text.py b/cv_analysis/utils/text.py
index 01f6c4b..5d9ccaa 100644
--- a/cv_analysis/utils/text.py
+++ b/cv_analysis/utils/text.py
@@ -47,8 +47,8 @@ def find_primary_text_regions(image):
 
     image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
 
-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) #20,3
-    close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=2)
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7))  # was (20, 3)
+    close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
 
     #show_mpl(close)
 
diff --git a/data/.gitignore b/data/.gitignore
index 09d8485..7b38b1e 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -1 +1,7 @@
 /test_pdf.pdf
+/pdfs_for_testing
+/figure_detection.png
+/layout_parsing.png
+/redaction_detection.png
+/table_parsing.png
+/pngs_for_testing
diff --git a/data/pdfs_for_testing.dvc b/data/pdfs_for_testing.dvc
new file mode 100644
index 0000000..e85e518
--- /dev/null
+++ b/data/pdfs_for_testing.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: bb0ce084f7ca54583972da71cb87e22c.dir
+  size: 367181628
+  nfiles: 28
+  path: pdfs_for_testing
diff --git a/data/pngs_for_testing.dvc b/data/pngs_for_testing.dvc
new file mode 100644
index 0000000..630eab7
--- /dev/null
+++ b/data/pngs_for_testing.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: 4fed91116111b47edf1c6f6a67eb84d3.dir
+  size: 58125058
+  nfiles: 230
+  path: pngs_for_testing
diff --git a/scripts/annotate.py b/scripts/annotate.py
index 35310bf..34b007c 100644
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@@ -3,7 +3,7 @@ import argparse
 from cv_analysis.table_parsing import annotate_tables_in_pdf
 from cv_analysis.redaction_detection import annotate_redactions_in_pdf
 from cv_analysis.layout_parsing import annotate_layout_in_pdf
-from cv_analysis.figure_detection import detect_figures_in_pdf
+from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files
 from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
 
 
@@ -11,7 +11,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("pdf_path")
    parser.add_argument("page_index", type=int)
-    parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"])
+    parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figures"])
     parser.add_argument("--show", action="store_true", default=False)
 
     args = parser.parse_args()
@@ -28,6 +28,6 @@ if __name__ == "__main__":
     elif args.type == "layout":
         annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
     elif args.type == "figure":
-        detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
-    elif args.type == "figure2":
-        detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=args.show)
+        detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
+    elif args.type == "figures":
+        detect_figures_in_test_files()