few corrections for including smaller figures

This commit is contained in:
llocarnini 2022-04-22 10:12:28 +02:00
parent 3669b6b341
commit 11a2465789
5 changed files with 53 additions and 44 deletions

View File

@ -1,41 +1,41 @@
from cv_analysis.layout_parsing import annotate_layout_in_pdf
from cv_analysis.figure_detection import figures_in_image, detect_figures
from cv_analysis.table_parsing import tables_in_image
from cv_analysis.figure_detection import detect_figures
from cv_analysis.table_parsing import tables_in_image, parse_table
from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.display import show_mpl
# Placeholder: accepts an image but has no implementation yet, so calling it
# does nothing and implicitly returns None.
def detect_parting_line(image):
    pass
def cut_out_content_structures(layout_rects, page, min_area=50000):
    """Crop each layout rectangle out of ``page`` and split the crops by area.

    Args:
        layout_rects: Iterable of ``(x, y, w, h)`` rectangles.
        page: Two-dimensionally sliceable image; it is indexed as
            ``page[y:y + h, x:x + w]``.
        min_area: Smallest ``w * h`` for a rectangle to count as large.
            Defaults to the previously hard-coded threshold of 50000.

    Returns:
        A ``(large_rects, small_rects)`` tuple. Each entry of either list is
        ``[rect, cropped_page]`` with ``rect`` being the ``(x, y, w, h)``
        tuple and ``cropped_page`` the matching slice of ``page``.
    """
    large_rects = []
    small_rects = []
    for x, y, w, h in layout_rects:
        rect = (x, y, w, h)
        # The crop is the same for both buckets, so take it once.
        # NOTE(review): if page is a NumPy array this slice is a view, not a
        # copy -- confirm callers do not rely on independent buffers.
        cropped_page = page[y:(y + h), x:(x + w)]
        bucket = large_rects if w * h >= min_area else small_rects
        bucket.append([rect, cropped_page])
    return large_rects, small_rects
def parse_and_label_content_structures(page, large_enough_rects, too_small_rects):
for coordinates, cropped_image in large_enough_rects:
def parse_content_structures(page, large_rects, small_rects):
for coordinates, cropped_image in large_rects:
non_text_rects = detect_figures(cropped_image)
print(len(non_text_rects), len(list(non_text_rects)))
if len(non_text_rects) == 0:
page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
elif tables_in_image(cropped_image)[0]:
page = draw_rectangles(page, [coordinates], color=(255, 0, 0), annotate=True)
stats = parse_table(page)
page = draw_rectangles(page, stats, annotate=True)
else:
page = draw_rectangles(page, [coordinates], color=(0, 0, 255), annotate=True)
# for coordinates, cropped_image in too_small_rects:
# for coordinates, cropped_image in small_rects:
# non_text_rects = detect_figures(cropped_image)
# if len(non_text_rects) == 0 and len(list(find_primary_text_regions(cropped_image))) > 0:
# page = draw_rectangles(page, [coordinates], color=(0, 255, 0), annotate=True)
@ -44,16 +44,13 @@ def parse_and_label_content_structures(page, large_enough_rects, too_small_rects
return page
def detect_figures_with_layout_parsing(pdf_path, page_index=1, show=False):
    """Parse the layout of one PDF page and annotate its content structures.

    The page and its layout rectangles come from ``annotate_layout_in_pdf``
    (called with ``return_rects=True``); the rectangles are cropped and split
    by ``cut_out_content_structures`` and then labelled on the page by
    ``parse_content_structures``.

    Args:
        pdf_path: Forwarded to ``annotate_layout_in_pdf``.
        page_index: Forwarded to ``annotate_layout_in_pdf``.
        show: When true, display the annotated page with ``show_mpl``
            instead of returning it.

    Returns:
        The annotated page when ``show`` is false, otherwise ``None``.
    """
    rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
    large, small = cut_out_content_structures(rects, page)
    annotated = parse_content_structures(page, large, small)
    if not show:
        return annotated
    show_mpl(annotated)
    return None

View File

@ -1,6 +1,7 @@
import cv2
import numpy as np
from pdf2image import pdf2image
from PIL import Image
from cv_analysis.utils.detection import detect_large_coherent_structures
from cv_analysis.utils.display import show_mpl
@ -16,9 +17,7 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
def detect_figures(image: np.array):
image = image.copy()
#show_mpl(image)
image = remove_primary_text_regions(image)
#show_mpl(image)
cnts = detect_large_coherent_structures(image)
cnts = filter(is_likely_figure, cnts)
@ -41,10 +40,18 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
return page
def figures_in_image(cropped_page):
    """Return True when ``detect_figures`` reports at least one hit.

    Args:
        cropped_page: Image region handed unchanged to ``detect_figures``.

    Returns:
        ``True`` if the detection result has non-zero length, else ``False``.
    """
    return len(detect_figures(cropped_page)) > 0
# pages = []
# for i in range(0,16):
# pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
# page_index = i
# page = detect_figures_in_pdf(pdf_path, page_index, show=False)
# pages.append(Image.fromarray(page))
# p1, p = pages[0], pages[1:]
#
# out_pdf_path = "/home/lillian/ocr_docs/out.pdf"
#
# p1.save(
# out_pdf_path, "PDF", resolution=150.0, save_all=True, append_images=p
# )

View File

@ -156,10 +156,11 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
else:
return page
def tables_in_image(cropped_image):
    """Run ``parse_table`` on an image region and report whether it found any.

    Args:
        cropped_image: Image region handed unchanged to ``parse_table``.

    Returns:
        ``(True, table_rects)`` when ``parse_table`` returns a non-empty
        result, otherwise ``(False, None)``.
    """
    table_rects = parse_table(cropped_image)
    has_tables = len(table_rects) > 0
    return (True, table_rects) if has_tables else (False, None)

View File

@ -1,4 +1,6 @@
import cv2
import numpy as np
from cv_analysis.utils.display import show_mpl
def remove_primary_text_regions(image):
@ -14,12 +16,9 @@ def remove_primary_text_regions(image):
image = image.copy()
cnts = find_primary_text_regions(image)
for cnt in cnts:
x, y, w, h = cv2.boundingRect(cnt)
print(x,y,w,h, w*h, w/h)
cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
return image
@ -38,7 +37,6 @@ def find_primary_text_regions(image):
def is_likely_primary_text_segments(cnt):
    """Decide whether a contour is kept as a primary text segment.

    A contour qualifies when its area lies strictly between 800 and 16000,
    or when its bounding box is more than three times as wide as it is high.
    """
    _, _, width, height = cv2.boundingRect(cnt)
    area = cv2.contourArea(cnt)
    if 800 < area < 16000:
        return True
    return width / height > 3
image = image.copy()
@ -48,14 +46,17 @@ def find_primary_text_regions(image):
image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7))
close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7)) #20,3
close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
show_mpl(close)
dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 5))
#show_mpl(close)
dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(7, 4)) #5,3
dilate = cv2.dilate(close, dilate_kernel, iterations=1)
show_mpl(dilate)
#show_mpl(dilate)
cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
cnts = filter(is_likely_primary_text_segments, cnts)
return cnts

View File

@ -4,13 +4,14 @@ from cv_analysis.table_parsing import annotate_tables_in_pdf
from cv_analysis.redaction_detection import annotate_redactions_in_pdf
from cv_analysis.layout_parsing import annotate_layout_in_pdf
from cv_analysis.figure_detection import detect_figures_in_pdf
from cv_analysis.fig_detection_with_layout import detect_figures_with_layout_parsing
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("pdf_path")
parser.add_argument("page_index", type=int)
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure"])
parser.add_argument("--type", choices=["table", "redaction", "layout", "figure", "figure2"])
args = parser.parse_args()
@ -27,3 +28,5 @@ if __name__ == "__main__":
annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
elif args.type == "figure":
detect_figures_in_pdf(args.pdf_path, page_index=args.page_index, show=True)
elif args.type == "figure2":
detect_figures_with_layout_parsing(args.pdf_path, page_index=args.page_index, show=True)