diff --git a/.gitignore b/.gitignore index f4ebc2a..fb1e327 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ build_venv/ /data/metadata_testing_files.csv .coverage /data/ +/venv/ \ No newline at end of file diff --git a/cv_analysis/layout_parsing.py b/cv_analysis/layout_parsing.py index f5de783..0e4593b 100644 --- a/cv_analysis/layout_parsing.py +++ b/cv_analysis/layout_parsing.py @@ -1,4 +1,3 @@ -import itertools from itertools import compress from itertools import starmap from operator import __and__ @@ -6,17 +5,15 @@ from operator import __and__ import cv2 import numpy as np - from cv_analysis.utils.connect_rects import connect_related_rects2 -from cv_analysis.utils.structures import Rectangle from cv_analysis.utils.postprocessing import ( - remove_overlapping, remove_included, has_no_parent, ) -from cv_analysis.utils.visual_logging import vizlogger +from cv_analysis.utils.structures import Rectangle -#could be dynamic parameter is the scan is noisy + +# could be a dynamic parameter if the scan is noisy def is_likely_segment(rect, min_area=100): return cv2.contourArea(rect, False) > min_area @@ -34,7 +31,7 @@ def find_segments(image): def dilate_page_components(image): - #if text is detected in words make kernel bigger + # if text is detected in words make kernel bigger image = cv2.GaussianBlur(image, (7, 7), 0) thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) @@ -49,7 +46,6 @@ def fill_in_component_area(image, rect): return ~image - def parse_layout(image: np.array): image = image.copy() image_ = image.copy() diff --git a/cv_analysis/redaction_detection.py b/cv_analysis/redaction_detection.py index 82e8c1f..ed7bd60 100644 --- a/cv_analysis/redaction_detection.py +++ b/cv_analysis/redaction_detection.py @@ -2,7 +2,8 @@ from functools import partial import cv2 import numpy as np -from iteration_utilities import starfilter, first +from iteration_utilities import
first +from iteration_utilities._iteration_utilities import starfilter from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy from cv_analysis.utils.visual_logging import vizlogger diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 06a27ed..bb3105f 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -1,7 +1,3 @@ -from functools import partial -from itertools import chain, starmap -from operator import attrgetter - import cv2 import numpy as np from funcy import lmap @@ -127,10 +123,10 @@ def parse_tables(image: np.array, show=False): image = preprocess(image) image = isolate_vertical_and_horizontal_components(image) rects = turn_connected_components_into_rects(image) - #print(rects, "\n\n") + # print(rects, "\n\n") rects = list(map(Rectangle.from_xywh, rects)) - #print(rects, "\n\n") + # print(rects, "\n\n") rects = remove_isolated(rects) - #print(rects, "\n\n") - + # print(rects, "\n\n") + return rects diff --git a/scripts/annotate.py b/scripts/annotate.py index 43d7ff1..fa5273e 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -46,5 +46,6 @@ if __name__ == "__main__": from cv_analysis.layout_parsing import parse_layout as analyze elif args.type == "figure": from cv_analysis.figure_detection.figure_detection import detect_figures + analyze = detect_figures annotate_page(page, analyze, draw, name=name, show=args.show) diff --git a/scripts/annotate_pdf.py b/scripts/annotate_pdf.py index e65f0a1..5dcd120 100644 --- a/scripts/annotate_pdf.py +++ b/scripts/annotate_pdf.py @@ -1,6 +1,5 @@ import argparse import timeit -from time import process_time from itertools import starmap from pathlib import Path @@ -57,7 +56,6 @@ if __name__ == "__main__": t2 = timeit.default_timer() save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type) t3 = timeit.default_timer() - print("[s] opening file and convert pdf pages to images: ", t1-t0) - print("[s] analyse and 
annotate images: ", t2-t1) - print("[s] save images as pdf: ", t3-t2) - + print("[s] opening file and convert pdf pages to images: ", t1 - t0) + print("[s] analyse and annotate images: ", t2 - t1) + print("[s] save images as pdf: ", t3 - t2)