Minor changes, refactoring, and test files added
This commit is contained in:
parent
0e30e97f80
commit
179ad20165
3
.gitignore
vendored
3
.gitignore
vendored
@ -13,7 +13,6 @@ build_venv/
|
||||
/.idea/table_parsing.iml
|
||||
/.idea/vcs.xml
|
||||
/results/
|
||||
/data
|
||||
/table_parsing.egg-info
|
||||
/target/
|
||||
/tests/
|
||||
@ -22,3 +21,5 @@ build_venv/
|
||||
/cv_analysis.egg-info/SOURCES.txt
|
||||
/cv_analysis.egg-info/top_level.txt
|
||||
/.vscode/
|
||||
/cv_analysis/test/test_data/example_pages.json
|
||||
/data/metadata_testing_files.csv
|
||||
|
||||
@ -23,5 +23,5 @@ deskew:
|
||||
test_dummy: test_dummy
|
||||
|
||||
visual_logging:
|
||||
level: $LOGGING_LEVEL_ROOT|INFO
|
||||
level: $LOGGING_LEVEL_ROOT|DEBUG
|
||||
output_folder: /tmp/debug/
|
||||
@ -55,18 +55,4 @@ def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
|
||||
else:
|
||||
return page
|
||||
|
||||
# pages = []
|
||||
# for i in range(0,16):
|
||||
# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
|
||||
# page_index = i
|
||||
# layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
|
||||
# big_structures, small_structures = cut_out_content_structures(layout_rects, page)
|
||||
# page = parse_content_structures(page, big_structures, small_structures)
|
||||
# pages.append(Image.fromarray(page))
|
||||
# p1, p = pages[0], pages[1:]
|
||||
#
|
||||
# out_pdf_path = "/home/lillian/ocr_docs/out1.pdf"
|
||||
#
|
||||
# p1.save(
|
||||
# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
|
||||
# )
|
||||
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pdf2image import pdf2image
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
import timeit
|
||||
|
||||
from cv_analysis.utils.detection import detect_large_coherent_structures
|
||||
from cv_analysis.utils.display import show_mpl
|
||||
@ -33,7 +36,7 @@ def detect_figures(image: np.array):
|
||||
|
||||
def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
|
||||
page = pdf2image.convert_from_path(pdf_path, dpi=300, first_page=page_index + 1, last_page=page_index + 1)[0]
|
||||
page = np.array(page)
|
||||
|
||||
redaction_contours = detect_figures(page)
|
||||
@ -43,16 +46,56 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
|
||||
show_mpl(page)
|
||||
return page
|
||||
|
||||
# pages = []
|
||||
# for i in range(0,16):
|
||||
# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
|
||||
# page_index = i
|
||||
# page = detect_figures_in_pdf(pdf_path,page_index)
|
||||
# pages.append(Image.fromarray(page))
|
||||
# p1, p = pages[0], pages[1:]
|
||||
#
|
||||
# out_pdf_path = "/home/lillian/ocr_docs/out.pdf"
|
||||
#
|
||||
# p1.save(
|
||||
# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
|
||||
# )
|
||||
|
||||
def detect_figures_in_test_files():
    """Annotate every metadata-listed test page and bundle them into one PDF.

    Reads (pdf_name, page) rows from the hard-coded metadata CSV, renders
    each page at 300 dpi, draws the detected figure rectangles on it and
    saves all annotated pages into a single multi-page PDF. Prints the
    total processing time in seconds.
    """
    def save_as_pdf(pages):
        # PIL builds a multi-page PDF from the first page plus append_images.
        first, rest = pages[0], pages[1:]
        out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_pdf.pdf"
        first.save(
            out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=rest
        )

    base_path = "/home/lillian/ocr_docs/"
    metadata = pd.read_csv(base_path + "/metadata/metadata2.csv")
    annotated_pages = []

    start = timeit.default_timer()
    for pdf_name, page_nr in zip(metadata.pdf_name, metadata.page):
        # NOTE(review): page_nr goes straight to pdf2image, whose pages are
        # 1-based -- assumes the CSV stores 1-based page numbers; confirm.
        rendered = pdf2image.convert_from_path(base_path + "/original/" + pdf_name, dpi=300,
                                               first_page=page_nr, last_page=page_nr)[0]
        image = np.array(rendered)
        contours = detect_figures(image)
        image = draw_rectangles(image, contours)
        annotated_pages.append(Image.fromarray(image))
    print(timeit.default_timer() - start)

    save_as_pdf(annotated_pages)
|
||||
|
||||
|
||||
def detect_figures_in_png(pdf_path, show=False):
    """Detect figures on a single PNG page and annotate them.

    Loads the image at *pdf_path*, draws a rectangle around every detected
    figure, logs the annotated image to the visual logger and optionally
    displays it with matplotlib.

    :param pdf_path: path of the PNG file to analyse.
    :param show: when True, display the annotated page.
    :return: the annotated page as a numpy array.
    """
    image = np.array(Image.open(pdf_path))

    contours = detect_figures(image)
    annotated = draw_rectangles(image, contours)
    vizlogger.debug(annotated, "figures03_final.png")
    if show:
        show_mpl(annotated)
    return annotated
|
||||
|
||||
|
||||
def detect_figures_in_test_files_png():
    """Run figure detection over every PNG test page and bundle the results.

    Reads the image names from the hard-coded metadata CSV, annotates each
    corresponding PNG via detect_figures_in_png, prints the elapsed time in
    seconds, and saves all annotated pages into a single multi-page PDF.
    """
    metadata = pd.read_csv("/home/lillian/ocr_docs/metadata/metadata2.csv")
    path = "/home/lillian/ocr_docs/png_example_pages/"
    pages = []
    t0 = timeit.default_timer()
    for name in metadata.image_name:
        # BUG FIX: detect_figures_in_png(pdf_path, show=False) takes no page
        # index -- the old call passed an extra positional argument
        # (page_index), which raised TypeError on every invocation.
        page = detect_figures_in_png(path + name + ".png", show=False)
        pages.append(Image.fromarray(page))
    t1 = timeit.default_timer()
    print(t1 - t0)
    # PIL builds a multi-page PDF from the first page plus append_images.
    p1, p = pages[0], pages[1:]
    out_pdf_path = "/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf"
    p1.save(
        out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
    )
|
||||
@ -10,6 +10,9 @@ CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
|
||||
LOG_FILE = "/tmp/log.log"
|
||||
|
||||
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
|
||||
PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
|
||||
PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
|
||||
HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed")
|
||||
|
||||
TEST_DIR = path.join(MODULE_DIR, "test")
|
||||
TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data")
|
||||
|
||||
116
cv_analysis/test/scripts/export_example_pages.py
Normal file
116
cv_analysis/test/scripts/export_example_pages.py
Normal file
@ -0,0 +1,116 @@
|
||||
import os
|
||||
from os import path
|
||||
import pandas as pd
|
||||
from pdf2image import convert_from_path
|
||||
from itertools import chain
|
||||
import json
|
||||
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS
|
||||
from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files
|
||||
|
||||
def read_json(path):
    """Load and return the JSON document stored at *path* (UTF-8)."""
    with open(path, encoding='utf-8') as file:
        return json.load(file)
|
||||
|
||||
|
||||
# def collect_metadata(example_pages, save=False):
|
||||
# metadata = []
|
||||
# i = 0
|
||||
# for name, document_sections in example_pages.items():
|
||||
# for pages in document_sections:
|
||||
# span = list(range(pages[0], pages[1] + 1))
|
||||
# for page_nr in span:
|
||||
# metadata.append(["fig_table" + str(i), name, page_nr])
|
||||
# i += 1
|
||||
# if save:
|
||||
# df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
||||
# df.to_csv("/exported_files/test_pages.csv")
|
||||
# else:
|
||||
# return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
||||
|
||||
|
||||
|
||||
def collect_metadata(example_pages, save=False):
    """Build one (image_name, pdf_name, page) row per example page.

    *example_pages* maps a pdf name to a list of [first_page, last_page]
    sections (inclusive bounds). Rows are numbered consecutively across
    all documents ("fig_table0", "fig_table1", ...).

    Refactored: the opaque helper chain (``f`` + the closure factory) is
    inlined into two plain loops, and the DataFrame is built only once.

    :param save: when True, write the rows to the metadata CSV under
        DVC_DATA_DIR and return None; otherwise return the DataFrame.
    """
    metadata = []
    i = 0
    for name, document_sections in example_pages.items():
        for pages in document_sections:
            for page_nr in range(pages[0], pages[1] + 1):
                metadata.append(["fig_table" + str(i), name, page_nr])
                i += 1
    df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
    if save:
        df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
    else:
        return df
|
||||
|
||||
|
||||
def f(name, document_sections, make_metadata_entry):
    """Yield one metadata entry per page covered by *document_sections*.

    Each section is a [first_page, last_page] pair with inclusive bounds;
    *make_metadata_entry* is called once per page number.
    """
    for section in document_sections:
        for page_nr in range(section[0], section[1] + 1):
            yield make_metadata_entry(name, page_nr)
|
||||
|
||||
|
||||
def make_metadata_entry_maker():
    """Return a factory producing ["fig_table<i>", name, page_nr] rows.

    The returned callable numbers its rows sequentially starting at 0,
    keeping the counter in a closure so every call gets a fresh sequence.
    """
    counter = -1

    def make_metadata_entry(name, page_nr):
        nonlocal counter
        counter += 1
        return [f"fig_table{counter}", name, page_nr]

    return make_metadata_entry
|
||||
|
||||
|
||||
def split_pdf(example_pages):
    """Render every example page span to a numbered PNG.

    For each PDF in *example_pages* and each [first_page, last_page]
    section, converts the pages at 300 dpi and saves them as
    fig_table<i>.png under PNG_FOR_TESTING, numbering the images
    consecutively across all documents.
    """
    counter = 0
    for name, document_sections in example_pages.items():
        source = path.join(PDF_FOR_TESTING, name)
        for section in document_sections:
            images = convert_from_path(pdf_path=source, dpi=300,
                                       first_page=section[0], last_page=section[1])
            for image in images:
                target = path.join(PNG_FOR_TESTING, "fig_table" + str(counter) + ".png")
                image.save(fp=target, dpi=(300, 300))
                counter += 1
|
||||
|
||||
def rename_files_with_hash(example_pages, hashes):
    """Rename the example PDFs inside HASHED_PDFS to their content hash.

    *hashes* is assumed to be aligned with ``example_pages.keys()`` (the
    order produced by hash_pdfs) -- TODO confirm with the caller.

    Fixes two defects of the previous version: the *hashes* argument was
    ignored entirely, and every file was renamed to "<name>_new.txt"
    (tutorial boilerplate), losing the .pdf extension instead of applying
    the hash.
    """
    name_to_hash = dict(zip(example_pages.keys(), hashes))
    folder = HASHED_PDFS

    # Iterate through the folder and rename only the files we know about
    for file in os.listdir(folder):
        if file in name_to_hash:
            old_name = path.join(folder, file)
            # keep the original extension, replace only the base name
            extension = path.splitext(file)[1]
            new_name = path.join(folder, name_to_hash[file] + extension)
            os.rename(old_name, new_name)

    # verify the result
    print(os.listdir(folder))
|
||||
|
||||
def hash_pdfs(example_pages):
    """Re-key *example_pages* by the content hash of each PDF.

    Hashes every PDF named in *example_pages* (resolved inside
    PDF_FOR_TESTING) and returns a new dict mapping hash -> page sections,
    relying on dict insertion order to keep keys and values aligned.
    """
    pdf_paths = [path.join(PDF_FOR_TESTING, pdf_name) for pdf_name in example_pages]
    hashes = hash_pdf_files(paths=pdf_paths, verbose=0)
    return dict(zip(hashes, example_pages.values()))
|
||||
|
||||
def main():
    """Export test metadata: hash the example PDFs and write the CSV."""
    pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
    pages = hash_pdfs(pages)
    collect_metadata(pages, save=True)
    # split_pdf(pages)  # enable to (re)generate the PNG test pages


if __name__ == "__main__":
    main()
|
||||
@ -25,7 +25,7 @@ def remove_included(rectangles):
|
||||
return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax
|
||||
|
||||
def is_not_included(rect, rectangles):
|
||||
return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
|
||||
return not any(includes(r2, rect) for r2 in rectangles if not rect == r2)
|
||||
|
||||
rectangles = list(map(xywh_to_vec_rect, rectangles))
|
||||
rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
|
||||
|
||||
@ -47,8 +47,8 @@ def find_primary_text_regions(image):
|
||||
|
||||
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
||||
|
||||
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) #20,3
|
||||
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=2)
|
||||
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) #20,3
|
||||
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
|
||||
|
||||
#show_mpl(close)
|
||||
|
||||
|
||||
6
data/.gitignore
vendored
6
data/.gitignore
vendored
@ -1 +1,7 @@
|
||||
/test_pdf.pdf
|
||||
/pdfs_for_testing
|
||||
/figure_detection.png
|
||||
/layout_parsing.png
|
||||
/redaction_detection.png
|
||||
/table_parsing.png
|
||||
/pngs_for_testing
|
||||
|
||||
5
data/pdfs_for_testing.dvc
Normal file
5
data/pdfs_for_testing.dvc
Normal file
@ -0,0 +1,5 @@
|
||||
outs:
|
||||
- md5: bb0ce084f7ca54583972da71cb87e22c.dir
|
||||
size: 367181628
|
||||
nfiles: 28
|
||||
path: pdfs_for_testing
|
||||
5
data/pngs_for_testing.dvc
Normal file
5
data/pngs_for_testing.dvc
Normal file
@ -0,0 +1,5 @@
|
||||
outs:
|
||||
- md5: 4fed91116111b47edf1c6f6a67eb84d3.dir
|
||||
size: 58125058
|
||||
nfiles: 230
|
||||
path: pngs_for_testing
|
||||
@ -3,7 +3,7 @@ import argparse
|
||||
from cv_analysis.table_parsing import annotate_tables_in_pdf
|
||||
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
|
||||
from cv_analysis.layout_parsing import annotate_layout_in_pdf
|
||||
from cv_analysis.figure_detection import detect_figures_in_pdf
|
||||
from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files
|
||||
from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("pdf_path")
|
||||
parser.add_argument("page_index", type=int)
|
||||
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"])
|
||||
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figures"])
|
||||
parser.add_argument("--show", action="store_true", default=False)
|
||||
|
||||
args = parser.parse_args()
|
||||
@ -28,6 +28,6 @@ if __name__ == "__main__":
|
||||
elif args.type == "layout":
|
||||
annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
|
||||
elif args.type == "figure":
|
||||
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
|
||||
elif args.type == "figure2":
|
||||
detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=args.show)
|
||||
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
|
||||
elif args.type == "figures":
|
||||
detect_figures_in_test_files()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user