From 3669b6b341361738437aecc46835ee73ac9b53f1 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Wed, 20 Apr 2022 09:43:30 +0200
Subject: [PATCH] fig_detection_with_layout.py: approach to label the content
 of a page through layout detection; table parsing for detected tables needs
 to be added and the overall code needs to be reviewed. layout_parsing.py: added
 condition so fig_detection_with_layout.py works. table_parsing.py: uncommented
 line for better table parsing. text.py: changed kernel sizes.

---
 cv_analysis/fig_detection_with_layout.py | 59 ++++++++++++++++++++++++
 cv_analysis/figure_detection.py          | 15 ++++--
 cv_analysis/layout_parsing.py            |  9 ++--
 cv_analysis/table_parsing.py             | 11 ++++-
 cv_analysis/utils/text.py                | 16 ++++---
 5 files changed, 96 insertions(+), 14 deletions(-)
 create mode 100644 cv_analysis/fig_detection_with_layout.py

diff --git a/cv_analysis/fig_detection_with_layout.py b/cv_analysis/fig_detection_with_layout.py
new file mode 100644
index 0000000..bd84789
--- /dev/null
+++ b/cv_analysis/fig_detection_with_layout.py
@@ -0,0 +1,59 @@
+from cv_analysis.layout_parsing import annotate_layout_in_pdf
+from cv_analysis.figure_detection import figures_in_image, detect_figures
+from cv_analysis.table_parsing import tables_in_image
+from cv_analysis.utils.text import find_primary_text_regions, remove_primary_text_regions
+from cv_analysis.utils.draw import draw_rectangles
+from cv_analysis.utils.display import show_mpl
+
+
+def detect_parting_line(image):
+    """Placeholder, not implemented yet: ignores image and returns None."""
+
+
+def cut_out_content_structures(layout_rects, page, min_area=100000):
+    """Crop every (x, y, w, h) rect out of page; split [rect, crop] pairs at w * h >= min_area."""
+    large_enough_rects = []
+    too_small_rects = []
+    for x, y, w, h in layout_rects:
+        rect = (x, y, w, h)
+        cropped_page = page[y:(y + h), x:(x + w)]
+        if w * h >= min_area:
+            large_enough_rects.append([rect, cropped_page])
+        else:
+            too_small_rects.append([rect, cropped_page])
+    return large_enough_rects, too_small_rects
+
+
+def parse_and_label_content_structures(page, large_enough_rects, too_small_rects):
+    """Draw one annotated, color-coded rectangle on page per large structure.
+
+    color=(0, 255, 0): detect_figures returned nothing for the crop.
+    color=(255, 0, 0): tables_in_image reported a table in the crop.
+    color=(0, 0, 255): every other crop.
+    too_small_rects is accepted but not labelled yet (TODO).
+    """
+    for coordinates, cropped_image in large_enough_rects:
+        non_text_rects = detect_figures(cropped_image)
+        if len(non_text_rects) == 0:
+            color = (0, 255, 0)
+        elif tables_in_image(cropped_image)[0]:
+            color = (255, 0, 0)
+        else:
+            color = (0, 0, 255)
+        page = draw_rectangles(page, [coordinates], color=color, annotate=True)
+    return page
+
+
+def detect_figures_over_layout():
+    """Label the layout structures of one hard-coded PDF page and display the result."""
+    # NOTE(review): developer-local path; turn it into a parameter before reuse.
+    pdf_path = "/home/lillian/ocr_docs/Report on spectra.pdf"
+    page_index = 13
+    layout_rects, page = annotate_layout_in_pdf(pdf_path, page_index, return_rects=True)
+    big_structures, small_structures = cut_out_content_structures(layout_rects, page)
+    page = parse_and_label_content_structures(page, big_structures, small_structures)
+    show_mpl(page)
+
+
+if __name__ == "__main__":
+    detect_figures_over_layout()
diff --git a/cv_analysis/figure_detection.py b/cv_analysis/figure_detection.py
index 3827536..4c44a57 100644
--- a/cv_analysis/figure_detection.py
+++ b/cv_analysis/figure_detection.py
@@ -15,11 +15,10 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
 
 
 def detect_figures(image: np.array):
-
     image = image.copy()
-
+    #show_mpl(image)
     image = remove_primary_text_regions(image)
-    show_mpl(image)
+    #show_mpl(image)
     cnts = detect_large_coherent_structures(image)
 
     cnts = filter(is_likely_figure, cnts)
@@ -30,7 +29,6 @@ def detect_figures(image: np.array):
 
 
 def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
-
     page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
     page = np.array(page)
 
@@ -41,3 +39,12 @@ def detect_figures_in_pdf(pdf_path, page_index=1, show=False):
         show_mpl(page)
     else:
         return page
+
+
+def figures_in_image(cropped_page):
+    """Report whether detect_figures finds anything in cropped_page.
+
+    Returns True when the detection result is non-empty, False otherwise.
+    """
+    redaction_contours = detect_figures(cropped_page)
+    return len(redaction_contours) > 0
diff --git a/cv_analysis/layout_parsing.py b/cv_analysis/layout_parsing.py
index ae5559f..19d56a1 100644
--- a/cv_analysis/layout_parsing.py
+++ b/cv_analysis/layout_parsing.py
@@ -63,15 +63,18 @@ def parse_layout(image: np.array):
     return list(rects)
 
 
-def annotate_layout_in_pdf(pdf_path, page_index=1, show=False):
+def annotate_layout_in_pdf(pdf_path, page_index=1, return_rects=False, show=False):
 
     page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
     page = np.array(page)
 
     rects = parse_layout(page)
-    page = draw_rectangles(page, rects)
 
-    if show:
+    if return_rects:  # checked before show; the page is returned without draw_rectangles applied
+        return rects, page
+    elif show:  # draw the layout rects and display the page; nothing is returned on this path
+        page = draw_rectangles(page, rects)
         show_mpl(page)
     else:
+        page = draw_rectangles(page, rects)  # default path: return the page with the rects drawn
         return page
diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py
index 404b7ed..83dd2e5 100644
--- a/cv_analysis/table_parsing.py
+++ b/cv_analysis/table_parsing.py
@@ -10,13 +10,14 @@ from cv_analysis.utils.display import show_mpl
 from cv_analysis.utils.draw import draw_rectangles
 from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
 from cv_analysis.utils.deskew import deskew_histbased
+from cv_analysis.utils.filters import is_large_enough
 from cv_analysis.layout_parsing import parse_layout
 
 
 def add_external_contours(image, img):
     contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
 
-    # contours = filter(partial(is_large_enough, min_area=5000000), contours)
+    contours = filter(partial(is_large_enough, min_area=5000000), contours)
 
     for cnt in contours:
         x, y, w, h = cv2.boundingRect(cnt)
@@ -154,3 +155,11 @@ def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
         show_mpl(page)
     else:
         return page
+
+def tables_in_image(cropped_image):
+    """Run parse_table on cropped_image and report whether it found anything.
+
+    Returns (True, table_rects) for a non-empty result, otherwise (False, None).
+    """
+    table_rects = parse_table(cropped_image)
+    return (True, table_rects) if len(table_rects) > 0 else (False, None)
diff --git a/cv_analysis/utils/text.py b/cv_analysis/utils/text.py
index 7ce6d7f..31f3d2c 100644
--- a/cv_analysis/utils/text.py
+++ b/cv_analysis/utils/text.py
@@ -1,5 +1,5 @@
 import cv2
-
+from cv_analysis.utils.display import show_mpl
 
 def remove_primary_text_regions(image):
     """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
@@ -17,6 +17,7 @@ def remove_primary_text_regions(image):
 
     for cnt in cnts:
         x, y, w, h = cv2.boundingRect(cnt)
+        print(x,y,w,h, w*h, w/h)
         cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
 
     return image
@@ -36,7 +37,9 @@ def find_primary_text_regions(image):
     """
 
     def is_likely_primary_text_segments(cnt):
-        return 700 < cv2.contourArea(cnt) < 16000
+        # Accept a contour with area in (800, 16000), or a bounding box over 3x wider than tall.
+        _, _, w, h = cv2.boundingRect(cnt)
+        return 800 < cv2.contourArea(cnt) < 16000 or w / h > 3
 
     image = image.copy()
 
@@ -45,13 +48,14 @@ def find_primary_text_regions(image):
 
     image = cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
 
-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7))
     close = cv2.morphologyEx(image, cv2.MORPH_CLOSE, close_kernel, iterations=1)
-    # show_mpl(close)
-    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
+    show_mpl(close)
+    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 5))
     dilate = cv2.dilate(close, dilate_kernel, iterations=1)
-    # show_mpl(dilate)
+    show_mpl(dilate)
     cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
     cnts = filter(is_likely_primary_text_segments, cnts)
 
     return cnts
+