minor changes, refactoring and testfiles added

This commit is contained in:
llocarnini 2022-05-17 09:17:24 +02:00
parent 0e30e97f80
commit 179ad20165
12 changed files with 204 additions and 39 deletions

3
.gitignore vendored
View File

@ -13,7 +13,6 @@ build_venv/
/.idea/table_parsing.iml
/.idea/vcs.xml
/results/
/data
/table_parsing.egg-info
/target/
/tests/
@ -22,3 +21,5 @@ build_venv/
/cv_analysis.egg-info/SOURCES.txt
/cv_analysis.egg-info/top_level.txt
/.vscode/
/cv_analysis/test/test_data/example_pages.json
/data/metadata_testing_files.csv

View File

@ -23,5 +23,5 @@ deskew:
test_dummy: test_dummy
visual_logging:
level: $LOGGING_LEVEL_ROOT|INFO
level: $LOGGING_LEVEL_ROOT|DEBUG
output_folder: /tmp/debug/

View File

@ -55,18 +55,4 @@ def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
else:
return page
# pages = []
# for i in range(0,16):
# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
# page_index = i
# layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
# big_structures, small_structures = cut_out_content_structures(layout_rects, page)
# page = parse_content_structures(page, big_structures, small_structures)
# pages.append(Image.fromarray(page))
# p1, p = pages[0], pages[1:]
#
# out_pdf_path = "/home/lillian/ocr_docs/out1.pdf"
#
# p1.save(
# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
# )

View File

@ -1,6 +1,9 @@
import cv2
import numpy as np
from pdf2image import pdf2image
import pandas as pd
from PIL import Image
import timeit
from cv_analysis.utils.detection import detect_large_coherent_structures
from cv_analysis.utils.display import show_mpl
@ -33,7 +36,7 @@ def detect_figures(image: np.array):
def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
page = pdf2image.convert_from_path(pdf_path, dpi=300, first_page=page_index + 1, last_page=page_index + 1)[0]
page = np.array(page)
redaction_contours = detect_figures(page)
@ -43,16 +46,56 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
show_mpl(page)
return page
# pages = []
# for i in range(0,16):
# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
# page_index = i
# page = detect_figures_in_pdf(pdf_path,page_index)
# pages.append(Image.fromarray(page))
# p1, p = pages[0], pages[1:]
#
# out_pdf_path = "/home/lillian/ocr_docs/out.pdf"
#
# p1.save(
# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
# )
def detect_figures_in_test_files(
    base_dir="/home/lillian/ocr_docs/",
    out_pdf_path="/home/lillian/ocr_docs/output_files/fig_detection_pdf.pdf",
):
    """Run figure detection on every page listed in the metadata CSV and save the result as one PDF.

    The metadata file ``<base_dir>/metadata/metadata2.csv`` is read with pandas; its
    ``pdf_name`` and ``page`` columns select the page to render from
    ``<base_dir>/original/<pdf_name>`` at 300 dpi. Detected figures are drawn onto each
    page and the elapsed detection time is printed.

    Args:
        base_dir: Directory containing the ``metadata`` and ``original`` sub-folders.
            Defaults to the location that was previously hard-coded.
        out_pdf_path: Destination of the multi-page PDF with the annotated pages.
            Defaults to the location that was previously hard-coded.
    """

    def save_as_pdf(pages):
        # The first image is saved and the remaining ones are appended to the same PDF.
        first_page, remaining_pages = pages[0], pages[1:]
        first_page.save(
            out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=remaining_pages
        )

    ex_pages = pd.read_csv(base_dir + "/metadata/metadata2.csv")
    pages_detected = []
    t0 = timeit.default_timer()
    for name, page_nr in zip(ex_pages.pdf_name, ex_pages.page):
        # first_page == last_page, so exactly one rendered page comes back.
        page = pdf2image.convert_from_path(
            base_dir + "/original/" + name, dpi=300, first_page=page_nr, last_page=page_nr
        )[0]
        page = np.array(page)
        redaction_contours = detect_figures(page)
        page = draw_rectangles(page, redaction_contours)
        pages_detected.append(Image.fromarray(page))
    print(timeit.default_timer() - t0)

    # An empty metadata table would otherwise raise IndexError on pages[0].
    if pages_detected:
        save_as_pdf(pages_detected)
def detect_figures_in_png(pdf_path, show=False):
    """Detect figures in a single image file and return the annotated image array.

    Args:
        pdf_path: Path of the image to open (despite the name, it is opened with
            ``PIL.Image.open`` rather than converted from a PDF).
        show: When true, display the annotated image with ``show_mpl``.

    Returns:
        The image array with the detected figure rectangles drawn on it.
    """
    image = np.array(Image.open(pdf_path))
    figure_contours = detect_figures(image)
    annotated = draw_rectangles(image, figure_contours)
    # Hand the final result to the visual logger under a fixed file name.
    vizlogger.debug(annotated, "figures03_final.png")
    if show:
        show_mpl(annotated)
    return annotated
def detect_figures_in_test_files_png(
    metadata_csv="/home/lillian/ocr_docs/metadata/metadata2.csv",
    png_dir="/home/lillian/ocr_docs/png_example_pages/",
    out_pdf_path="/home/lillian/ocr_docs/output_files/fig_detection_png2.pdf",
):
    """Run figure detection on the pre-rendered PNG example pages and save the result as one PDF.

    The ``image_name`` column of the metadata CSV names the PNG files (without the
    ``.png`` extension) inside ``png_dir``. The elapsed detection time is printed.

    Args:
        metadata_csv: CSV file with an ``image_name`` column.
        png_dir: Directory prefix that is concatenated with ``<image_name>.png``.
        out_pdf_path: Destination of the multi-page PDF with the annotated pages.

    All defaults are the locations that were previously hard-coded.
    """
    file_name = pd.read_csv(metadata_csv)
    pages = []
    t0 = timeit.default_timer()
    for name in file_name.image_name:
        # detect_figures_in_png takes (path, show); the former extra positional
        # `page_index` collided with `show=` and raised TypeError.
        page = detect_figures_in_png(png_dir + name + ".png", show=False)
        pages.append(Image.fromarray(page))
    t1 = timeit.default_timer()
    print(t1 - t0)

    # Nothing to write when the metadata table is empty (pages[0] would raise IndexError).
    if not pages:
        return
    first_page, remaining_pages = pages[0], pages[1:]
    first_page.save(
        out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=remaining_pages
    )

View File

@ -10,6 +10,9 @@ CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
LOG_FILE = "/tmp/log.log"
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
HASHED_PDFS = path.join(PDF_FOR_TESTING, "hashed")
TEST_DIR = path.join(MODULE_DIR, "test")
TEST_DATA_DIR = path.join(MODULE_DIR, "test", "test_data")

View File

@ -0,0 +1,116 @@
import os
from os import path
import pandas as pd
from pdf2image import convert_from_path
from itertools import chain
import json
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS
from cv_analysis.utils.deduplicate_pdfs import hash_pdf_files
def read_json(path):
    """Load and return the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)
# def collect_metadata(example_pages, save=False):
# metadata = []
# i = 0
# for name, document_sections in example_pages.items():
# for pages in document_sections:
# span = list(range(pages[0], pages[1] + 1))
# for page_nr in span:
# metadata.append(["fig_table" + str(i), name, page_nr])
# i += 1
# if save:
# df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
# df.to_csv("/exported_files/test_pages.csv")
# else:
# return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
def collect_metadata(example_pages, save=False):
    """Build the page-level metadata table for the example documents.

    For every document in ``example_pages`` the generator ``f`` yields one entry per
    page, created by the numbering closure from ``make_metadata_entry_maker``; the
    entries become the rows of a DataFrame with the columns ``image_name``,
    ``pdf_name`` and ``page``.

    Args:
        example_pages: Mapping of PDF name to its list of page sections.
        save: When true, additionally write the table to
            ``metadata_testing_files.csv`` inside ``DVC_DATA_DIR``.

    Returns:
        The metadata as a ``pandas.DataFrame``. It is returned in both modes;
        earlier the ``save=True`` path returned ``None``.
    """
    make_metadata_entry = make_metadata_entry_maker()
    # One shared entry maker keeps the image numbering continuous across documents.
    metadata = list(
        chain.from_iterable(
            f(name, document_sections, make_metadata_entry)
            for name, document_sections in example_pages.items()
        )
    )
    df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
    if save:
        df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
    return df
def f(name, document_sections, make_metadata_entry):
    """Yield one metadata entry per page of every section of a document.

    Args:
        name: Document name passed through to ``make_metadata_entry``.
        document_sections: Iterable of sections; ``section[0]`` is the first page and
            ``section[1]`` the last page (inclusive) of the section.
        make_metadata_entry: Callable invoked as ``make_metadata_entry(name, page_nr)``.

    Yields:
        Whatever ``make_metadata_entry`` returns, in section and page order.
    """
    for section in document_sections:
        first_page, last_page = section[0], section[1]
        # The upper bound is inclusive, hence the +1.
        yield from (
            make_metadata_entry(name, page_nr)
            for page_nr in range(first_page, last_page + 1)
        )
def make_metadata_entry_maker():
    """Return a function that creates consecutively numbered metadata entries.

    Every call of the returned function yields ``["fig_table<N>", name, page_nr]``,
    where ``N`` starts at 0 and grows by one per call. Each maker has its own counter.
    """
    next_index = 0

    def make_metadata_entry(name, page_nr):
        nonlocal next_index
        entry = ["fig_table" + str(next_index), name, page_nr]
        next_index += 1
        return entry

    return make_metadata_entry
def split_pdf(example_pages):
    """Render the listed page ranges of the test PDFs to numbered PNG files.

    Every section ``(first_page, last_page)`` of every document in ``example_pages``
    is converted at 300 dpi; each resulting image is written to ``PNG_FOR_TESTING``
    as ``fig_table<N>.png`` with a running index ``N`` starting at 0.

    Args:
        example_pages: Mapping of PDF file name (relative to ``PDF_FOR_TESTING``) to
            its list of page sections.
    """
    image_counter = 0
    for pdf_name, document_sections in example_pages.items():
        pdf_file = path.join(PDF_FOR_TESTING, pdf_name)
        for section in document_sections:
            rendered_pages = convert_from_path(
                pdf_path=pdf_file, dpi=300, first_page=section[0], last_page=section[1]
            )
            for rendered_page in rendered_pages:
                target = path.join(PNG_FOR_TESTING, "fig_table" + str(image_counter) + ".png")
                rendered_page.save(fp=target, dpi=(300, 300))
                image_counter += 1
def rename_files_with_hash(example_pages,hashes):
    """Rename the files in ``HASHED_PDFS`` whose names are keys of ``example_pages``.

    A matching file ``<stem>.<ext>`` is renamed to ``<stem>_new.txt`` in the same
    folder. Afterwards the folder listing is printed.

    Args:
        example_pages: Mapping whose keys are the file names to rename.
        hashes: Not used by the current implementation.

    NOTE(review): the function name suggests the new names should be derived from
    ``hashes`` - confirm the intended naming scheme.
    """
    folder = HASHED_PDFS
    wanted = set(example_pages)
    for entry in os.listdir(folder):
        if entry not in wanted:
            continue
        stem = path.splitext(entry)[0]
        # Source and target both live directly in the folder.
        os.rename(path.join(folder, entry), path.join(folder, stem + '_new' + '.txt'))
    # Show the folder content after renaming.
    print(os.listdir(folder))
def hash_pdfs(example_pages):
    """Return a copy of ``example_pages`` keyed by the values ``hash_pdf_files`` returns.

    Args:
        example_pages: Mapping of PDF file name (relative to ``PDF_FOR_TESTING``) to
            its page sections.

    Returns:
        A dict pairing each value returned by ``hash_pdf_files`` with the page
        sections at the same position in ``example_pages``.
    """
    pdf_paths = [path.join(PDF_FOR_TESTING, pdf_name) for pdf_name in example_pages]
    hashes = hash_pdf_files(paths=pdf_paths, verbose=0)
    # Keys and values of a dict iterate in the same order, so zip pairs them correctly.
    return {pdf_hash: sections for pdf_hash, sections in zip(hashes, example_pages.values())}
def main():
    """Read the example page list, re-key it by PDF hash and write the metadata CSV."""
    json_path = path.join(TEST_DATA_DIR, "example_pages.json")
    hashed_pages = hash_pdfs(read_json(json_path))
    collect_metadata(hashed_pages, save=True)
    # Rendering the pages to PNG is currently disabled:
    # split_pdf(hashed_pages)


if __name__ == "__main__":
    main()

View File

@ -25,7 +25,7 @@ def remove_included(rectangles):
return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax
def is_not_included(rect, rectangles):
return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
return not any(includes(r2, rect) for r2 in rectangles if not rect == r2)
rectangles = list(map(xywh_to_vec_rect, rectangles))
rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)

View File

@ -47,8 +47,8 @@ def find_primary_text_regions(image):
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) #20,3
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=2)
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) #20,3
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
#show_mpl(close)

6
data/.gitignore vendored
View File

@ -1 +1,7 @@
/test_pdf.pdf
/pdfs_for_testing
/figure_detection.png
/layout_parsing.png
/redaction_detection.png
/table_parsing.png
/pngs_for_testing

View File

@ -0,0 +1,5 @@
outs:
- md5: bb0ce084f7ca54583972da71cb87e22c.dir
size: 367181628
nfiles: 28
path: pdfs_for_testing

View File

@ -0,0 +1,5 @@
outs:
- md5: 4fed91116111b47edf1c6f6a67eb84d3.dir
size: 58125058
nfiles: 230
path: pngs_for_testing

View File

@ -3,7 +3,7 @@ import argparse
from cv_analysis.table_parsing import annotate_tables_in_pdf
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
from cv_analysis.layout_parsing import annotate_layout_in_pdf
from cv_analysis.figure_detection import detect_figures_in_pdf
from cv_analysis.figure_detection import detect_figures_in_pdf, detect_figures_in_test_files
from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
@ -11,7 +11,7 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("pdf_path")
parser.add_argument("page_index", type=int)
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"])
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figures"])
parser.add_argument("--show", action="store_true", default=False)
args = parser.parse_args()
@ -28,6 +28,6 @@ if __name__ == "__main__":
elif args.type == "layout":
annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
elif args.type == "figure":
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=args.show)
elif args.type == "figure2":
detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=args.show)
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
elif args.type == "figures":
detect_figures_in_test_files()