minor changes, refactoring and testfiles added

2022-05-17 09:17:24 +02:00 · 2022-05-17 09:17:24 +02:00 · 179ad20165
commit 179ad20165
parent 0e30e97f80
12 changed files with 204 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,6 @@ build_venv/
 /.idea/table_parsing.iml
 /.idea/vcs.xml
 /results/
-/data
 /table_parsing.egg-info
 /target/
 /tests/
@@ -22,3 +21,5 @@ build_venv/
 /cv_analysis.egg-info/SOURCES.txt
 /cv_analysis.egg-info/top_level.txt
 /.vscode/
+/cv_analysis/test/test_data/example_pages.json
+/data/metadata_testing_files.csv
--- a/config.yaml
+++ b/config.yaml
@@ -23,5 +23,5 @@ deskew:
 test_dummy: test_dummy

 visual_logging:
-  level: $LOGGING_LEVEL_ROOT|INFO
+  level: $LOGGING_LEVEL_ROOT|DEBUG
  output_folder: /tmp/debug/
--- a/cv_analysis/fig_detection_with_layout.py
+++ b/cv_analysis/fig_detection_with_layout.py
@@ -55,18 +55,4 @@ def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
    else:
        return page

-# pages = []
-# for i in range(0,16):
-#     pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
-#     page_index = i
-#     layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
-#     big_structures, small_structures = cut_out_content_structures(layout_rects, page)
-#     page = parse_content_structures(page, big_structures, small_structures)
-#     pages.append(Image.fromarray(page))
-# p1, p = pages[0], pages[1:]
-#
-# out_pdf_path = "/home/lillian/ocr_docs/out1.pdf"
-#
-# p1.save(
-#     out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
-# )
+
--- a/cv_analysis/figure_detection.py
+++ b/cv_analysis/figure_detection.py
@@ -1,6 +1,9 @@
 import cv2
 import numpy as np
 from pdf2image import pdf2image
+import pandas as pd
+from PIL import Image
+import timeit

 from cv_analysis.utils.detection import detect_large_coherent_structures
 from cv_analysis.utils.display import show_mpl
@@ -33,7 +36,7 @@ def detect_figures(image: np.array):

 def detect_figures_in_pdf(pdf_path, page_index=1, show=False):

-    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = pdf2image.convert_from_path(pdf_path, dpi=300, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)

    redaction_contours = detect_figures(page)
@@ -43,16 +46,56 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
        show_mpl(page)
    return page

-# pages = []
-# for i in range(0,16):
-#     pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
-#     page_index = i
-#     page = detect_figures_in_pdf(pdf_path,page_index)
-#     pages.append(Image.fromarray(page))
-# p1, p = pages[0], pages[1:]
-#
-# out_pdf_path = "/home/lillian/ocr_docs/out.pdf"
-#
-# p1.save(
-#     out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
-# )
+
+def detect_figures_in_test_files():
+    def save_as_pdf(pages):
+        p1, p = pages[0], pages[1:]
+        out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_pdf.pdf"
+        p1.save(
+            out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
+        )
+    path = "/home/lillian/ocr_docs/"
+    ex_pages = pd.read_csv(path+"/metadata/metadata2.csv")
+    pages_detected = []
+
+    t0 = timeit.default_timer()
+    for name, page_nr in zip(ex_pages.pdf_name, ex_pages.page):
+        page = pdf2image.convert_from_path(path + "/original/" + name, dpi=300, first_page=page_nr, last_page=page_nr)[0]
+        page = np.array(page)
+        redaction_contours = detect_figures(page)
+        page = draw_rectangles(page, redaction_contours)
+        pages_detected.append(Image.fromarray(page))
+    print(timeit.default_timer()-t0)
+
+    save_as_pdf(pages_detected)
+
+
+def detect_figures_in_png(pdf_path, show=False):
+
+    page = Image.open(pdf_path)
+    page = np.array(page)
+
+    redaction_contours = detect_figures(page)
+    page = draw_rectangles(page, redaction_contours)
+    vizlogger.debug(page, "figures03_final.png")
+    if show:
+        show_mpl(page)
+    return page
+
+
+def detect_figures_in_test_files_png():
+    file_name = pd.read_csv("/home/lillian/ocr_docs/metadata/metadata2.csv")
+    path = "/home/lillian/ocr_docs/png_example_pages/"
+    pages = []
+    page_index = 0
+    t0 = timeit.default_timer()
+    for name in file_name.image_name:
+        page = detect_figures_in_png(path+name+".png", page_index, show=False)
+        pages.append(Image.fromarray(page))
+    t1 = timeit.default_timer()
+    print(t1-t0)
+    p1, p = pages[0], pages[1:]
+    out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf"
+    p1.save(
+        out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
+    )
--- a/cv_analysis/locations.py
+++ b/cv_analysis/locations.py
@@ -10,6 +10,9 @@ CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
 LOG_FILE = "/tmp/log.log"

 DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
+PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
+PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
+HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed")

 TEST_DIR = path.join(MODULE_DIR, "test")
 TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data")
--- a/cv_analysis/test/scripts/export_example_pages.py
+++ b/cv_analysis/test/scripts/export_example_pages.py
@@ -0,0 +1,116 @@
+import os
+from os import path
+import pandas as pd
+from pdf2image import convert_from_path
+from itertools import chain
+import json
+from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS
+from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files
+
+def read_json(path):
+    with open(path, encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+
+
+# def collect_metadata(example_pages, save=False):
+#     metadata = []
+#     i = 0
+#     for name, document_sections in example_pages.items():
+#         for pages in document_sections:
+#             span = list(range(pages[0], pages[1] + 1))
+#             for page_nr in span:
+#                 metadata.append(["fig_table" + str(i), name, page_nr])
+#                 i += 1
+#     if save:
+#         df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
+#         df.to_csv("/exported_files/test_pages.csv")
+#     else:
+#         return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
+
+
+
+def collect_metadata(example_pages, save=False):
+    metadata = []
+    make_metadata_entry = make_metadata_entry_maker()
+    for name, document_sections in example_pages.items():
+        metadata.append(f(name, document_sections, make_metadata_entry))
+    metadata = list(chain.from_iterable(metadata))
+    if save:
+        df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
+        df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
+    else:
+        return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
+
+
+def f(name, document_sections, make_metadata_entry):
+    for pages in document_sections:
+        span = list(range(pages[0], pages[1] + 1))
+        for page_nr in span:
+            yield make_metadata_entry(name, page_nr)
+
+
+def make_metadata_entry_maker():
+    i = -1
+
+    def make_metadata_entry(name, page_nr):
+        nonlocal i
+        i += 1
+        return ["fig_table" + str(i), name, page_nr]
+
+    return make_metadata_entry
+
+
+def split_pdf(example_pages):
+    dir_path = PDF_FOR_TESTING
+    i = 0
+    for name, document_sections in example_pages.items():
+        for pages in document_sections:
+            images = convert_from_path(pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0],
+                                       last_page=pages[1])
+            for image in images:
+                fp = path.join(PNG_FOR_TESTING, "fig_table" + str(i) + ".png")
+                image.save(fp=fp, dpi=(300, 300))
+                i += 1
+
+def rename_files_with_hash(example_pages,hashes):
+
+    files_to_rename = list(example_pages.keys())
+    folder = HASHED_PDFS
+
+    # Iterate through the folder
+    for file in os.listdir(folder):
+        # Checking if the file is present in the list
+        if file in files_to_rename:
+            # construct current name using file name and path
+            old_name = path.join(folder, file)
+            # get file name without extension
+            only_name = path.splitext(file)[0]
+
+            # Adding the new name with extension
+            new_base = only_name + '_new' + '.txt'
+            # construct full file path
+            new_name = path.join(folder, new_base)
+
+            # Renaming the file
+            os.rename(old_name, new_name)
+
+    # verify the result
+    res = os.listdir(folder)
+    print(res)
+
+def hash_pdfs(example_pages):
+    pdf_paths = list(path.join(PDF_FOR_TESTING, pdf_name) for pdf_name in example_pages.keys())
+    hashes = hash_pdf_files(paths=pdf_paths, verbose=0)
+    example_pages = dict(zip(hashes, example_pages.values()))
+    return example_pages
+
+def main():
+    examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
+    examples_pages = hash_pdfs(examples_pages)
+    collect_metadata(examples_pages, save=True)
+    #split_pdf(examples_pages)
+
+
+if __name__ == "__main__":
+    main()
--- a/cv_analysis/utils/post_processing.py
+++ b/cv_analysis/utils/post_processing.py
@@ -25,7 +25,7 @@ def remove_included(rectangles):
        return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax

    def is_not_included(rect, rectangles):
-        return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
+        return not any(includes(r2, rect) for r2 in rectangles if not rect == r2)

    rectangles = list(map(xywh_to_vec_rect, rectangles))
    rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
--- a/cv_analysis/utils/text.py
+++ b/cv_analysis/utils/text.py
@@ -47,8 +47,8 @@ def find_primary_text_regions(image):

    image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) #20,3
-    close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=2)
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) #20,3
+    close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)

    #show_mpl(close)

--- a/data/.gitignore
+++ b/data/.gitignore
@@ -1 +1,7 @@
 /test_pdf.pdf
+/pdfs_for_testing
+/figure_detection.png
+/layout_parsing.png
+/redaction_detection.png
+/table_parsing.png
+/pngs_for_testing
--- a/data/pdfs_for_testing.dvc
+++ b/data/pdfs_for_testing.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: bb0ce084f7ca54583972da71cb87e22c.dir
+  size: 367181628
+  nfiles: 28
+  path: pdfs_for_testing
--- a/data/pngs_for_testing.dvc
+++ b/data/pngs_for_testing.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: 4fed91116111b47edf1c6f6a67eb84d3.dir
+  size: 58125058
+  nfiles: 230
+  path: pngs_for_testing
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@@ -3,7 +3,7 @@ import argparse
 from cv_analysis.table_parsing import annotate_tables_in_pdf
 from cv_analysis.redaction_detection import annotate_redactions_in_pdf
 from cv_analysis.layout_parsing import annotate_layout_in_pdf
-from cv_analysis.figure_detection import detect_figures_in_pdf
+from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files
 from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing


@@ -11,7 +11,7 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_path")
    parser.add_argument("page_index", type=int)
-    parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"])
+    parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figures"])
    parser.add_argument("--show", action="store_true", default=False)

    args = parser.parse_args()
@@ -28,6 +28,6 @@ if __name__ == "__main__":
    elif args.type == "layout":
        annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
    elif args.type == "figure":
-        detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
-    elif args.type == "figure2":
-        detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=args.show)
+        detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
+    elif args.type == "figures":
+        detect_figures_in_test_files()