few corrections for including smaller figures
This commit is contained in:
parent
3669b6b341
commit
11a2465789
@ -1,41 +1,41 @@
|
|||||||
from cv_analysis.layout_parsing import annotate_layout_in_pdf
|
from cv_analysis.layout_parsing import annotate_layout_in_pdf
|
||||||
from cv_analysis.figure_detection import figures_in_image, detect_figures
|
from cv_analysis.figure_detection import detect_figures
|
||||||
from cv_analysis.table_parsing import tables_in_image
|
from cv_analysis.table_parsing import tables_in_image, parse_table
|
||||||
from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions
|
from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions
|
||||||
from cv_analysis.utils.draw import draw_rectangles
|
from cv_analysis.utils.draw import draw_rectangles
|
||||||
from cv_analysis.utils.display import show_mpl
|
from cv_analysis.utils.display import show_mpl
|
||||||
|
|
||||||
|
|
||||||
def detect_parting_line(image):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def cut_out_content_structures(layout_rects, page):
|
def cut_out_content_structures(layout_rects, page):
|
||||||
large_enough_rects = []
|
large_rects = []
|
||||||
too_small_rects = []
|
small_rects = []
|
||||||
for x, y, w, h in layout_rects:
|
for x, y, w, h in layout_rects:
|
||||||
rect = (x, y, w, h)
|
rect = (x, y, w, h)
|
||||||
if w * h >= 100000:
|
if w * h >= 50000:
|
||||||
cropped_page = page[y:(y + h), x:(x + w)]
|
cropped_page = page[y:(y + h), x:(x + w)]
|
||||||
large_enough_rects.append([rect, cropped_page])
|
large_rects.append([rect, cropped_page])
|
||||||
else:
|
else:
|
||||||
cropped_page = page[y:(y + h), x:(x + w)]
|
cropped_page = page[y:(y + h), x:(x + w)]
|
||||||
too_small_rects.append([rect, cropped_page])
|
small_rects.append([rect, cropped_page])
|
||||||
return large_enough_rects, too_small_rects
|
return large_rects, small_rects
|
||||||
|
|
||||||
|
|
||||||
def parse_and_label_content_structures(page, large_enough_rects, too_small_rects):
|
def parse_content_structures(page, large_rects, small_rects):
|
||||||
for coordinates, cropped_image in large_enough_rects:
|
for coordinates, cropped_image in large_rects:
|
||||||
non_text_rects = detect_figures(cropped_image)
|
non_text_rects = detect_figures(cropped_image)
|
||||||
print(len(non_text_rects), len(list(non_text_rects)))
|
|
||||||
if len(non_text_rects) == 0:
|
if len(non_text_rects) == 0:
|
||||||
page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
|
page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
|
||||||
|
|
||||||
elif tables_in_image(cropped_image)[0]:
|
elif tables_in_image(cropped_image)[0]:
|
||||||
page = draw_rectangles(page, [coordinates], color=(255, 0, 0), annotate=True)
|
page = draw_rectangles(page, [coordinates], color=(255, 0, 0), annotate=True)
|
||||||
|
stats = parse_table(page)
|
||||||
|
page = draw_rectangles(page, stats, annotate=True)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
page = draw_rectangles(page, [coordinates], color=(0, 0, 255), annotate=True)
|
page = draw_rectangles(page, [coordinates], color=(0, 0, 255), annotate=True)
|
||||||
|
|
||||||
# for coordinates, cropped_image in too_small_rects:
|
# for coordinates, cropped_image in small_rects:
|
||||||
# non_text_rects = detect_figures(cropped_image)
|
# non_text_rects = detect_figures(cropped_image)
|
||||||
# if len(non_text_rects) == 0 and len(list(find_primary_text_regions(cropped_image))) > 0:
|
# if len(non_text_rects) == 0 and len(list(find_primary_text_regions(cropped_image))) > 0:
|
||||||
# page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
|
# page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
|
||||||
@ -44,16 +44,13 @@ def parse_and_label_content_structures(page, large_enough_rects, too_small_rects
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def detect_figures_over_layout():
|
def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
|
||||||
# pdf_path = "/home/lillian/PycharmProjects/ner_address/data/pdfs/syngenta/026c917f04660aaea4bb57d180f9598b.pdf"
|
|
||||||
# pdf_path = "/home/lillian/ocr_docs/ocr1.pdf"
|
|
||||||
pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
|
|
||||||
#pdf_path = "/home/lillian/ocr_docs/VV-857853.pdf"
|
|
||||||
page_index = 13
|
|
||||||
layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
|
layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
|
||||||
big_structures, small_structures = cut_out_content_structures(layout_rects, page)
|
big_structures, small_structures = cut_out_content_structures(layout_rects, page)
|
||||||
page = parse_and_label_content_structures(page, big_structures, small_structures)
|
page = parse_content_structures(page, big_structures, small_structures)
|
||||||
|
|
||||||
|
if show:
|
||||||
show_mpl(page)
|
show_mpl(page)
|
||||||
|
else:
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
detect_figures_over_layout()
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pdf2image import pdf2image
|
from pdf2image import pdf2image
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from cv_analysis.utils.detection import detect_large_coherent_structures
|
from cv_analysis.utils.detection import detect_large_coherent_structures
|
||||||
from cv_analysis.utils.display import show_mpl
|
from cv_analysis.utils.display import show_mpl
|
||||||
@ -16,9 +17,7 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
|
|||||||
|
|
||||||
def detect_figures(image: np.array):
|
def detect_figures(image: np.array):
|
||||||
image = image.copy()
|
image = image.copy()
|
||||||
#show_mpl(image)
|
|
||||||
image = remove_primary_text_regions(image)
|
image = remove_primary_text_regions(image)
|
||||||
#show_mpl(image)
|
|
||||||
cnts = detect_large_coherent_structures(image)
|
cnts = detect_large_coherent_structures(image)
|
||||||
|
|
||||||
cnts = filter(is_likely_figure, cnts)
|
cnts = filter(is_likely_figure, cnts)
|
||||||
@ -41,10 +40,18 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def figures_in_image(cropped_page):
|
|
||||||
redaction_contours = detect_figures(cropped_page)
|
|
||||||
|
|
||||||
if len(redaction_contours) > 0:
|
# pages = []
|
||||||
return True
|
# for i in range(0,16):
|
||||||
else:
|
# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
|
||||||
return False
|
# page_index = i
|
||||||
|
# page = detect_figures_in_pdf(pdf_path, page_index, show=False)
|
||||||
|
# pages.append(Image.fromarray(page))
|
||||||
|
# p1, p = pages[0], pages[1:]
|
||||||
|
#
|
||||||
|
# out_pdf_path = "/home/lillian/ocr_docs/out.pdf"
|
||||||
|
#
|
||||||
|
# p1.save(
|
||||||
|
# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
|
||||||
|
# )
|
||||||
|
|
||||||
|
|||||||
@ -156,10 +156,11 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
|
|||||||
else:
|
else:
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def tables_in_image(cropped_image):
|
def tables_in_image(cropped_image):
|
||||||
table_rects = parse_table(cropped_image)
|
table_rects = parse_table(cropped_image)
|
||||||
|
|
||||||
if len(table_rects)>0:
|
if len(table_rects) > 0:
|
||||||
return True, table_rects
|
return True, table_rects
|
||||||
else:
|
else:
|
||||||
return False, None
|
return False, None
|
||||||
|
|||||||
@ -1,4 +1,6 @@
|
|||||||
import cv2
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from cv_analysis.utils.display import show_mpl
|
from cv_analysis.utils.display import show_mpl
|
||||||
|
|
||||||
def remove_primary_text_regions(image):
|
def remove_primary_text_regions(image):
|
||||||
@ -14,12 +16,9 @@ def remove_primary_text_regions(image):
|
|||||||
image = image.copy()
|
image = image.copy()
|
||||||
|
|
||||||
cnts = find_primary_text_regions(image)
|
cnts = find_primary_text_regions(image)
|
||||||
|
|
||||||
for cnt in cnts:
|
for cnt in cnts:
|
||||||
x, y, w, h = cv2.boundingRect(cnt)
|
x, y, w, h = cv2.boundingRect(cnt)
|
||||||
print(x,y,w,h, w*h, w/h)
|
|
||||||
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
|
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
|
||||||
|
|
||||||
return image
|
return image
|
||||||
|
|
||||||
|
|
||||||
@ -38,7 +37,6 @@ def find_primary_text_regions(image):
|
|||||||
|
|
||||||
def is_likely_primary_text_segments(cnt):
|
def is_likely_primary_text_segments(cnt):
|
||||||
x,y,w,h = cv2.boundingRect(cnt)
|
x,y,w,h = cv2.boundingRect(cnt)
|
||||||
print(cv2.contourArea(cnt))
|
|
||||||
return 800 < cv2.contourArea(cnt) < 16000 or w/h > 3
|
return 800 < cv2.contourArea(cnt) < 16000 or w/h > 3
|
||||||
|
|
||||||
image = image.copy()
|
image = image.copy()
|
||||||
@ -48,14 +46,17 @@ def find_primary_text_regions(image):
|
|||||||
|
|
||||||
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
||||||
|
|
||||||
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7))
|
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) #20,3
|
||||||
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
|
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
|
||||||
show_mpl(close)
|
|
||||||
dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 5))
|
#show_mpl(close)
|
||||||
|
|
||||||
|
dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(7, 4)) #5,3
|
||||||
dilate = cv2.dilate(close, dilate_kernel, iterations=1)
|
dilate = cv2.dilate(close, dilate_kernel, iterations=1)
|
||||||
show_mpl(dilate)
|
|
||||||
|
#show_mpl(dilate)
|
||||||
|
|
||||||
cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
||||||
cnts = filter(is_likely_primary_text_segments, cnts)
|
cnts = filter(is_likely_primary_text_segments, cnts)
|
||||||
|
|
||||||
return cnts
|
return cnts
|
||||||
|
|
||||||
|
|||||||
@ -4,13 +4,14 @@ from cv_analysis.table_parsing import annotate_tables_in_pdf
|
|||||||
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
|
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
|
||||||
from cv_analysis.layout_parsing import annotate_layout_in_pdf
|
from cv_analysis.layout_parsing import annotate_layout_in_pdf
|
||||||
from cv_analysis.figure_detection import detect_figures_in_pdf
|
from cv_analysis.figure_detection import detect_figures_in_pdf
|
||||||
|
from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("pdf_path")
|
parser.add_argument("pdf_path")
|
||||||
parser.add_argument("page_index", type=int)
|
parser.add_argument("page_index", type=int)
|
||||||
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure"])
|
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@ -27,3 +28,5 @@ if __name__ == "__main__":
|
|||||||
annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
|
annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
|
||||||
elif args.type == "figure":
|
elif args.type == "figure":
|
||||||
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
|
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
|
||||||
|
elif args.type == "figure2":
|
||||||
|
detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=True)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user