From d555e86475e82024f8e1a5fc5b0ac70faa091ee1 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:24:04 +0100
Subject: [PATCH 01/27] refactored figure detection once

---
 vidocp/figure_detection.py | 49 +++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index e852646..af68835 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -18,11 +18,29 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
     return is_large_enough(cont, min_area) and has_acceptable_format(cont, max_width_to_hight_ratio)
 
 
-def detect_figures(image: np.array):
+def remove_primary_text_regions(image):
+    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+
+    Args:
+        image: Image to remove primary text from.
+
+    Returns:
+        Image with primary text removed.
+
+    References:
+        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
+    """
+
+    def filter_likely_primary_text_segments(cnts):
+        for c in cnts:
+            area = cv2.contourArea(c)
+            if area > 800 and area < 15000:
+                yield cv2.boundingRect(c)
 
     image = image.copy()
 
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
     thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
 
     close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
@@ -33,16 +51,19 @@ def detect_figures(image: np.array):
 
     cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
 
-    def filter_rects():
-        for c in cnts:
-            area = cv2.contourArea(c)
-            if area > 800 and area < 15000:
-                yield cv2.boundingRect(c)
-
-    for rect in filter_rects():
+    for rect in filter_likely_primary_text_segments(cnts):
         x, y, w, h = rect
         cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
 
+    return image
+
+
+def __detect_large_coherent_structures(image: np.array):
+    """Detects large coherent structures on an image.
+   
+    References:
+         https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
+    """
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
 
     thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
@@ -55,8 +76,18 @@ def detect_figures(image: np.array):
 
     cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 
+    return cnts
+
+
+def detect_figures(image: np.array):
+
+    image = image.copy()
+
+    image = remove_primary_text_regions(image)
+    cnts = __detect_large_coherent_structures(image)
+
     cnts = filter(is_likely_figure, cnts)
-    rects = [cv2.boundingRect(c) for c in cnts]
+    rects = map(cv2.boundingRect, cnts)
     rects = remove_included(rects)
 
     return rects

From c9780a57e5a048529d36958ba678eddb11759cef Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:24:41 +0100
Subject: [PATCH 02/27] removed obsolete import

---
 vidocp/figure_detection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index af68835..46e5484 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -2,7 +2,7 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import draw_contours, show_mpl, draw_rectangles, remove_included, remove_overlapping, show_cv2
+from vidocp.utils import show_mpl, draw_rectangles, remove_included
 
 
 def is_large_enough(cont, min_area=10000):

From 504cafbd5d4bba183d9943b36c60548aae34e402 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:25:44 +0100
Subject: [PATCH 03/27] renaming

---
 scripts/annotate.py        | 4 ++--
 vidocp/figure_detection.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/annotate.py b/scripts/annotate.py
index 10d40cc..682b8ad 100644
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@@ -3,7 +3,7 @@ import argparse
 from vidocp.table_parsing import annotate_tables_in_pdf
 from vidocp.redaction_detection import annotate_boxes_in_pdf
 from vidocp.layout_parsing import annotate_layout_in_pdf
-from vidocp.figure_detection import remove_text_in_pdf
+from vidocp.figure_detection import detect_figures_in_pdf
 
 
 def parse_args():
@@ -26,4 +26,4 @@ if __name__ == "__main__":
     elif args.type == "layout":
         annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index)
     elif args.type == "figure":
-        remove_text_in_pdf(args.pdf_path, page_index=args.page_index)
+        detect_figures_in_pdf(args.pdf_path, page_index=args.page_index)
diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 46e5484..f063f1a 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -93,7 +93,7 @@ def detect_figures(image: np.array):
     return rects
 
 
-def remove_text_in_pdf(pdf_path, page_index=1):
+def detect_figures_in_pdf(pdf_path, page_index=1):
 
     page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
     page = np.array(page)

From fed3a7e4f1b8b7ca4e14f9e495459c26490fb50b Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:26:16 +0100
Subject: [PATCH 04/27] refactoring

---
 vidocp/figure_detection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index f063f1a..b5fd38b 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -34,7 +34,7 @@ def remove_primary_text_regions(image):
     def filter_likely_primary_text_segments(cnts):
         for c in cnts:
             area = cv2.contourArea(c)
-            if area > 800 and area < 15000:
+            if 800 < area < 15000:
                 yield cv2.boundingRect(c)
 
     image = image.copy()

From 98d77cb522a08821c3a13ae2cffbe7239c654762 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:27:55 +0100
Subject: [PATCH 05/27] refactoring

---
 vidocp/figure_detection.py | 64 ++------------------------------------
 vidocp/utils.py            | 62 ++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index b5fd38b..830fc97 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -2,7 +2,8 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import show_mpl, draw_rectangles, remove_included
+from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \
+    __detect_large_coherent_structures
 
 
 def is_large_enough(cont, min_area=10000):
@@ -18,67 +19,6 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
     return is_large_enough(cont, min_area) and has_acceptable_format(cont, max_width_to_hight_ratio)
 
 
-def remove_primary_text_regions(image):
-    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
-
-    Args:
-        image: Image to remove primary text from.
-
-    Returns:
-        Image with primary text removed.
-
-    References:
-        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
-    """
-
-    def filter_likely_primary_text_segments(cnts):
-        for c in cnts:
-            area = cv2.contourArea(c)
-            if 800 < area < 15000:
-                yield cv2.boundingRect(c)
-
-    image = image.copy()
-
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
-
-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
-    close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)
-
-    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
-    dilate = cv2.dilate(close, dilate_kernel, iterations=1)
-
-    cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
-
-    for rect in filter_likely_primary_text_segments(cnts):
-        x, y, w, h = rect
-        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
-
-    return image
-
-
-def __detect_large_coherent_structures(image: np.array):
-    """Detects large coherent structures on an image.
-   
-    References:
-         https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
-    """
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
-
-    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5))
-    dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)
-
-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
-    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
-
-    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
-    return cnts
-
-
 def detect_figures(image: np.array):
 
     image = image.copy()
diff --git a/vidocp/utils.py b/vidocp/utils.py
index ee528b4..8b9235f 100644
--- a/vidocp/utils.py
+++ b/vidocp/utils.py
@@ -2,6 +2,7 @@ from collections import namedtuple
 from functools import partial
 
 import cv2
+import numpy as np
 from matplotlib import pyplot as plt
 
 
@@ -141,3 +142,64 @@ def vec_rect_to_xywh(rect):
     w = x2 - x
     h = y2 - y
     return x, y, w, h
+
+
+def remove_primary_text_regions(image):
+    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+
+    Args:
+        image: Image to remove primary text from.
+
+    Returns:
+        Image with primary text removed.
+
+    References:
+        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
+    """
+
+    def filter_likely_primary_text_segments(cnts):
+        for c in cnts:
+            area = cv2.contourArea(c)
+            if 800 < area < 15000:
+                yield cv2.boundingRect(c)
+
+    image = image.copy()
+
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
+    close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)
+
+    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
+    dilate = cv2.dilate(close, dilate_kernel, iterations=1)
+
+    cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+    for rect in filter_likely_primary_text_segments(cnts):
+        x, y, w, h = rect
+        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
+
+    return image
+
+
+def __detect_large_coherent_structures(image: np.array):
+    """Detects large coherent structures on an image.
+
+    References:
+         https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
+    """
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
+
+    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5))
+    dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)
+
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
+    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
+
+    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    return cnts
\ No newline at end of file

From aa66b6865b00b0490b9e7695a6bae386e6f96723 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:31:21 +0100
Subject: [PATCH 06/27] refactoring

---
 vidocp/figure_detection.py    | 11 +----------
 vidocp/redaction_detection.py | 17 +----------------
 vidocp/table_parsing.py       |  4 ++--
 vidocp/utils.py               | 22 +++++++++++++++++++++-
 4 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 830fc97..5cb44ca 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -3,16 +3,7 @@ import numpy as np
 from pdf2image import pdf2image
 
 from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \
-    __detect_large_coherent_structures
-
-
-def is_large_enough(cont, min_area=10000):
-    return cv2.contourArea(cont, False) > min_area
-
-
-def has_acceptable_format(cont, max_width_to_hight_ratio=6):
-    _, _, w, h = cv2.boundingRect(cont)
-    return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio)
+    __detect_large_coherent_structures, is_large_enough, has_acceptable_format
 
 
 def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py
index f1b319a..31cb3b1 100644
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@@ -5,22 +5,7 @@ import numpy as np
 import pdf2image
 from iteration_utilities import starfilter, first
 
-from vidocp.utils import show_mpl, draw_contours
-
-
-def is_filled(hierarchy):
-    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
-    return hierarchy[3] <= 0 and hierarchy[2] == -1
-
-
-def is_boxy(contour):
-    epsilon = 0.01 * cv2.arcLength(contour, True)
-    approx = cv2.approxPolyDP(contour, epsilon, True)
-    return len(approx) <= 10
-
-
-def is_large_enough(contour, min_area):
-    return cv2.contourArea(contour, False) > min_area
+from vidocp.utils import show_mpl, draw_contours, is_large_enough, is_filled, is_boxy
 
 
 def is_likely_redaction(contour, hierarchy, min_area):
diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 765fb1c..035f569 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -2,7 +2,7 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import show_cv2, draw_stats
+from vidocp.utils import draw_stats, show_mpl
 
 
 def add_external_contours(image, img):
@@ -52,4 +52,4 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
     stats = parse_table(page)
     page = draw_stats(page, stats)
 
-    show_cv2(page)
+    show_mpl(page)
diff --git a/vidocp/utils.py b/vidocp/utils.py
index 8b9235f..752cd3f 100644
--- a/vidocp/utils.py
+++ b/vidocp/utils.py
@@ -202,4 +202,24 @@ def __detect_large_coherent_structures(image: np.array):
 
     cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 
-    return cnts
\ No newline at end of file
+    return cnts
+
+
+def is_large_enough(cont, min_area):
+    return cv2.contourArea(cont, False) > min_area
+
+
+def has_acceptable_format(cont, max_width_to_hight_ratio):
+    _, _, w, h = cv2.boundingRect(cont)
+    return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio)
+
+
+def is_filled(hierarchy):
+    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    return hierarchy[3] <= 0 and hierarchy[2] == -1
+
+
+def is_boxy(contour):
+    epsilon = 0.01 * cv2.arcLength(contour, True)
+    approx = cv2.approxPolyDP(contour, epsilon, True)
+    return len(approx) <= 10
\ No newline at end of file

From 89a99d3586db4fbafa743a45bdd02eaf0c1f341f Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:39:49 +0100
Subject: [PATCH 07/27] refactoring

---
 vidocp/utils.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/vidocp/utils.py b/vidocp/utils.py
index 752cd3f..a7a50a4 100644
--- a/vidocp/utils.py
+++ b/vidocp/utils.py
@@ -157,11 +157,8 @@ def remove_primary_text_regions(image):
         https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
     """
 
-    def filter_likely_primary_text_segments(cnts):
-        for c in cnts:
-            area = cv2.contourArea(c)
-            if 800 < area < 15000:
-                yield cv2.boundingRect(c)
+    def is_likely_primary_text_segments(cnt):
+        return 800 < cv2.contourArea(cnt) < 15000
 
     image = image.copy()
 
@@ -177,8 +174,8 @@ def remove_primary_text_regions(image):
 
     cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
 
-    for rect in filter_likely_primary_text_segments(cnts):
-        x, y, w, h = rect
+    for cnt in filter(is_likely_primary_text_segments, cnts):
+        x, y, w, h = cv2.boundingRect(cnt)
         cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
 
     return image

From e8863d67aaaff138fb088c4e496a91b6354cc059 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:42:45 +0100
Subject: [PATCH 08/27] refactoring

---
 vidocp/figure_detection.py | 11 +++++++++--
 vidocp/utils.py            | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 5cb44ca..2bf77db 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -2,8 +2,15 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \
-    __detect_large_coherent_structures, is_large_enough, has_acceptable_format
+from vidocp.utils import (
+    show_mpl,
+    draw_rectangles,
+    remove_included,
+    remove_primary_text_regions,
+    __detect_large_coherent_structures,
+    is_large_enough,
+    has_acceptable_format,
+)
 
 
 def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
diff --git a/vidocp/utils.py b/vidocp/utils.py
index a7a50a4..ff61135 100644
--- a/vidocp/utils.py
+++ b/vidocp/utils.py
@@ -144,8 +144,8 @@ def vec_rect_to_xywh(rect):
     return x, y, w, h
 
 
-def remove_primary_text_regions(image):
-    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+def find_primary_text_regions(image):
+    """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
 
     Args:
         image: Image to remove primary text from.
@@ -174,7 +174,26 @@ def remove_primary_text_regions(image):
 
     cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
 
-    for cnt in filter(is_likely_primary_text_segments, cnts):
+    cnts = filter(is_likely_primary_text_segments, cnts)
+
+    return cnts
+
+
+def remove_primary_text_regions(image):
+    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+
+    Args:
+        image: Image to remove primary text from.
+
+    Returns:
+        Image with primary text removed.
+    """
+
+    image = image.copy()
+
+    cnts = find_primary_text_regions(image)
+
+    for cnt in cnts:
         x, y, w, h = cv2.boundingRect(cnt)
         cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
 
@@ -206,17 +225,21 @@ def is_large_enough(cont, min_area):
     return cv2.contourArea(cont, False) > min_area
 
 
-def has_acceptable_format(cont, max_width_to_hight_ratio):
+def has_acceptable_format(cont, max_width_to_height_ratio):
     _, _, w, h = cv2.boundingRect(cont)
-    return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio)
+    return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio)
 
 
 def is_filled(hierarchy):
-    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    """Checks whether a hierarchy is filled.
+
+    References:
+        https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    """
     return hierarchy[3] <= 0 and hierarchy[2] == -1
 
 
 def is_boxy(contour):
     epsilon = 0.01 * cv2.arcLength(contour, True)
     approx = cv2.approxPolyDP(contour, epsilon, True)
-    return len(approx) <= 10
\ No newline at end of file
+    return len(approx) <= 10

From 9d30009dceec0357db6499bfaffae8ce97718ee0 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:45:53 +0100
Subject: [PATCH 09/27] refactoring

---
 vidocp/figure_detection.py  | 4 ++--
 vidocp/utils/__init__.py    | 1 +
 vidocp/{ => utils}/utils.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)
 create mode 100644 vidocp/utils/__init__.py
 rename vidocp/{ => utils}/utils.py (99%)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 2bf77db..2c58968 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -7,7 +7,7 @@ from vidocp.utils import (
     draw_rectangles,
     remove_included,
     remove_primary_text_regions,
-    __detect_large_coherent_structures,
+    detect_large_coherent_structures,
     is_large_enough,
     has_acceptable_format,
 )
@@ -22,7 +22,7 @@ def detect_figures(image: np.array):
     image = image.copy()
 
     image = remove_primary_text_regions(image)
-    cnts = __detect_large_coherent_structures(image)
+    cnts = detect_large_coherent_structures(image)
 
     cnts = filter(is_likely_figure, cnts)
     rects = map(cv2.boundingRect, cnts)
diff --git a/vidocp/utils/__init__.py b/vidocp/utils/__init__.py
new file mode 100644
index 0000000..90f60fd
--- /dev/null
+++ b/vidocp/utils/__init__.py
@@ -0,0 +1 @@
+from .utils import *
\ No newline at end of file
diff --git a/vidocp/utils.py b/vidocp/utils/utils.py
similarity index 99%
rename from vidocp/utils.py
rename to vidocp/utils/utils.py
index ff61135..1802fca 100644
--- a/vidocp/utils.py
+++ b/vidocp/utils/utils.py
@@ -200,7 +200,7 @@ def remove_primary_text_regions(image):
     return image
 
 
-def __detect_large_coherent_structures(image: np.array):
+def detect_large_coherent_structures(image: np.array):
     """Detects large coherent structures on an image.
 
     References:

From d9567da428c81f9cd7971a657281df0a90166810 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:47:18 +0100
Subject: [PATCH 10/27] refactoring

---
 vidocp/figure_detection.py |  2 +-
 vidocp/utils/text.py       | 57 ++++++++++++++++++++++++++++++++++++++
 vidocp/utils/utils.py      | 56 -------------------------------------
 3 files changed, 58 insertions(+), 57 deletions(-)
 create mode 100644 vidocp/utils/text.py

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 2c58968..42ded9b 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -6,11 +6,11 @@ from vidocp.utils import (
     show_mpl,
     draw_rectangles,
     remove_included,
-    remove_primary_text_regions,
     detect_large_coherent_structures,
     is_large_enough,
     has_acceptable_format,
 )
+from vidocp.utils.text import remove_primary_text_regions
 
 
 def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py
new file mode 100644
index 0000000..3a7d2b1
--- /dev/null
+++ b/vidocp/utils/text.py
@@ -0,0 +1,57 @@
+import cv2
+
+
+def remove_primary_text_regions(image):
+    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+
+    Args:
+        image: Image to remove primary text from; it is copied first, so the caller's array is not drawn on.
+
+    Returns:
+        Copy of the image in which the bounding box of every detected primary text contour is filled with white.
+    """
+
+    image = image.copy()
+
+    cnts = find_primary_text_regions(image)
+
+    for cnt in cnts:
+        x, y, w, h = cv2.boundingRect(cnt)
+        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)  # thickness -1 fills the rectangle
+
+    return image
+
+
+def find_primary_text_regions(image):
+    """Finds contours of likely primary text, meaning main text body paragraphs rather than e.g. figure descriptions.
+
+    Args:
+        image: Image to search for primary text; it is converted to grayscale with COLOR_BGR2GRAY and not modified.
+
+    Returns:
+        Lazy, single-use `filter` iterator over the external contours whose area is between 800 and 15000.
+
+    References:
+        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
+    """
+
+    def is_likely_primary_text_segments(cnt):
+        return 800 < cv2.contourArea(cnt) < 15000
+
+    image = image.copy()
+
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    # NOTE(review): with THRESH_OTSU OpenCV presumably derives the threshold itself and ignores the 253 - confirm.
+    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+    # The close kernel is much wider than tall (20x3); presumably this joins glyphs of one text line - confirm.
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
+    close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)
+
+    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
+    dilate = cv2.dilate(close, dilate_kernel, iterations=1)
+
+    cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+    cnts = filter(is_likely_primary_text_segments, cnts)
+
+    return cnts
\ No newline at end of file
diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py
index 1802fca..2121804 100644
--- a/vidocp/utils/utils.py
+++ b/vidocp/utils/utils.py
@@ -144,62 +144,6 @@ def vec_rect_to_xywh(rect):
     return x, y, w, h
 
 
-def find_primary_text_regions(image):
-    """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
-
-    Args:
-        image: Image to remove primary text from.
-
-    Returns:
-        Image with primary text removed.
-
-    References:
-        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
-    """
-
-    def is_likely_primary_text_segments(cnt):
-        return 800 < cv2.contourArea(cnt) < 15000
-
-    image = image.copy()
-
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
-
-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
-    close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1)
-
-    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3))
-    dilate = cv2.dilate(close, dilate_kernel, iterations=1)
-
-    cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
-
-    cnts = filter(is_likely_primary_text_segments, cnts)
-
-    return cnts
-
-
-def remove_primary_text_regions(image):
-    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
-
-    Args:
-        image: Image to remove primary text from.
-
-    Returns:
-        Image with primary text removed.
-    """
-
-    image = image.copy()
-
-    cnts = find_primary_text_regions(image)
-
-    for cnt in cnts:
-        x, y, w, h = cv2.boundingRect(cnt)
-        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
-
-    return image
-
-
 def detect_large_coherent_structures(image: np.array):
     """Detects large coherent structures on an image.
 

From e652da1fa88a048f9a5211b4e8c0b96074fb5849 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:53:17 +0100
Subject: [PATCH 11/27] refactoring

---
 vidocp/figure_detection.py      |  13 +--
 vidocp/layout_parsing.py        |   4 +-
 vidocp/redaction_detection.py   |   4 +-
 vidocp/table_parsing.py         |   3 +-
 vidocp/utils/__init__.py        |   2 +-
 vidocp/utils/detection.py       |  23 +++++
 vidocp/utils/display.py         |  16 +++
 vidocp/utils/draw.py            |  56 ++++++++++
 vidocp/utils/filters.py         |  25 +++++
 vidocp/utils/post_processing.py |  62 +++++++++++
 vidocp/utils/text.py            |   2 +-
 vidocp/utils/utils.py           | 177 --------------------------------
 12 files changed, 197 insertions(+), 190 deletions(-)
 create mode 100644 vidocp/utils/detection.py
 create mode 100644 vidocp/utils/display.py
 create mode 100644 vidocp/utils/draw.py
 create mode 100644 vidocp/utils/filters.py
 create mode 100644 vidocp/utils/post_processing.py

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 42ded9b..27a8eb2 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -2,14 +2,11 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import (
-    show_mpl,
-    draw_rectangles,
-    remove_included,
-    detect_large_coherent_structures,
-    is_large_enough,
-    has_acceptable_format,
-)
+from vidocp.utils.detection import detect_large_coherent_structures
+from vidocp.utils.display import show_mpl
+from vidocp.utils.draw import draw_rectangles
+from vidocp.utils.post_processing import remove_included
+from vidocp.utils.filters import is_large_enough, has_acceptable_format
 from vidocp.utils.text import remove_primary_text_regions
 
 
diff --git a/vidocp/layout_parsing.py b/vidocp/layout_parsing.py
index 67cd89e..b5f1c51 100644
--- a/vidocp/layout_parsing.py
+++ b/vidocp/layout_parsing.py
@@ -6,7 +6,9 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import draw_rectangles, show_mpl, remove_overlapping, remove_included, has_no_parent
+from vidocp.utils.display import show_mpl
+from vidocp.utils.draw import draw_rectangles
+from vidocp.utils.post_processing import remove_overlapping, remove_included, has_no_parent
 
 
 def is_likely_segment(rect, min_area=100):
diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py
index 31cb3b1..1843f60 100644
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@@ -5,7 +5,9 @@ import numpy as np
 import pdf2image
 from iteration_utilities import starfilter, first
 
-from vidocp.utils import show_mpl, draw_contours, is_large_enough, is_filled, is_boxy
+from vidocp.utils.display import show_mpl
+from vidocp.utils.draw import draw_contours
+from vidocp.utils.filters import is_large_enough, is_filled, is_boxy
 
 
 def is_likely_redaction(contour, hierarchy, min_area):
diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 035f569..c991d43 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -2,7 +2,8 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import draw_stats, show_mpl
+from vidocp.utils.display import show_mpl
+from vidocp.utils.draw import draw_stats
 
 
 def add_external_contours(image, img):
diff --git a/vidocp/utils/__init__.py b/vidocp/utils/__init__.py
index 90f60fd..16281fe 100644
--- a/vidocp/utils/__init__.py
+++ b/vidocp/utils/__init__.py
@@ -1 +1 @@
-from .utils import *
\ No newline at end of file
+from .utils import *
diff --git a/vidocp/utils/detection.py b/vidocp/utils/detection.py
new file mode 100644
index 0000000..e5d8266
--- /dev/null
+++ b/vidocp/utils/detection.py
@@ -0,0 +1,23 @@
+import cv2
+import numpy as np
+
+
+def detect_large_coherent_structures(image: np.array):
+    """Detects large coherent structures on an image and returns their external contours.
+
+    References:
+         https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
+    """
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    # Pixels above 253 become 255 and all others 0; the mask is inverted (~thresh) before the dilation below.
+    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
+    # getStructuringElement expects a shape; MORPH_OPEN is an operation flag that merely aliased MORPH_ELLIPSE (both 2).
+    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
+    dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)
+
+    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
+    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
+
+    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    return cnts
diff --git a/vidocp/utils/display.py b/vidocp/utils/display.py
new file mode 100644
index 0000000..e0cb8ab
--- /dev/null
+++ b/vidocp/utils/display.py
@@ -0,0 +1,16 @@
+import cv2
+from matplotlib import pyplot as plt
+
+
+def show_mpl(image):
+
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(image)
+    plt.show()
+
+
+def show_cv2(image):
+
+    cv2.imshow("", image)
+    cv2.waitKey(0)
diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py
new file mode 100644
index 0000000..32c66f6
--- /dev/null
+++ b/vidocp/utils/draw.py
@@ -0,0 +1,56 @@
+import cv2
+
+from vidocp.utils import copy_and_normalize_channels
+
+
+def draw_contours(image, contours):
+
+    image = copy_and_normalize_channels(image)
+
+    for cont in contours:
+        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
+
+    return image
+
+
+def draw_rectangles(image, rectangles, color=None):
+
+    image = copy_and_normalize_channels(image)
+
+    if not color:
+        color = (0, 255, 0)
+
+    for rect in rectangles:
+        x, y, w, h = rect
+        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
+
+    return image
+
+
+def draw_stats(image, stats, annotate=False):
+
+    image = copy_and_normalize_channels(image)
+
+    keys = ["x", "y", "w", "h"]
+
+    def annotate_stat(x, y, w, h):
+
+        for i, (s, v) in enumerate(zip(keys, [x, y, w, h])):
+            anno = f"{s} = {v}"
+            xann = int(x + 5)
+            yann = int(y + h - (20 * (i + 1)))
+            cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
+
+    def draw_stat(stat):
+
+        x, y, w, h, area = stat
+
+        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+        if annotate:
+            annotate_stat(x, y, w, h)
+
+    for stat in stats[2:]:
+        draw_stat(stat)
+
+    return image
diff --git a/vidocp/utils/filters.py b/vidocp/utils/filters.py
new file mode 100644
index 0000000..274925c
--- /dev/null
+++ b/vidocp/utils/filters.py
@@ -0,0 +1,25 @@
+import cv2
+
+
+def is_large_enough(cont, min_area):
+    return cv2.contourArea(cont, False) > min_area
+
+
+def has_acceptable_format(cont, max_width_to_height_ratio):
+    _, _, w, h = cv2.boundingRect(cont)
+    return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio)
+
+
+def is_filled(hierarchy):
+    """Checks whether a hierarchy is filled.
+
+    References:
+        https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    """
+    return hierarchy[3] <= 0 and hierarchy[2] == -1
+
+
+def is_boxy(contour):
+    epsilon = 0.01 * cv2.arcLength(contour, True)
+    approx = cv2.approxPolyDP(contour, epsilon, True)
+    return len(approx) <= 10
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
new file mode 100644
index 0000000..0cd7f62
--- /dev/null
+++ b/vidocp/utils/post_processing.py
@@ -0,0 +1,62 @@
+from collections import namedtuple
+from functools import partial
+
+
+def remove_overlapping(rectangles):
+    def overlap(a, b):
+        return compute_intersection(a, b) > 0
+
+    def does_not_overlap(rect, rectangles):
+        return not any(overlap(rect, r2) for r2 in rectangles if not rect == r2)
+
+    rectangles = list(map(xywh_to_vec_rect, rectangles))
+    rectangles = filter(partial(does_not_overlap, rectangles=rectangles), rectangles)
+    rectangles = map(vec_rect_to_xywh, rectangles)
+    return rectangles
+
+
+def remove_included(rectangles):
+    def included(a, b):
+        return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax
+
+    def is_not_included(rect, rectangles):
+        return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
+
+    rectangles = list(map(xywh_to_vec_rect, rectangles))
+    rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
+    rectangles = map(vec_rect_to_xywh, rectangles)
+    return rectangles
+
+
+Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
+
+
+def make_box(x1, y1, x2, y2):
+    keys = "x1", "y1", "x2", "y2"
+    return dict(zip(keys, [x1, y1, x2, y2]))
+
+
+def compute_intersection(a, b):
+
+    dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)
+    dy = min(a.ymax, b.ymax) - max(a.ymin, b.ymin)
+
+    return dx * dy if (dx >= 0) and (dy >= 0) else 0
+
+
+def has_no_parent(hierarchy):
+    return hierarchy[-1] <= 0
+
+
+def xywh_to_vec_rect(rect):
+    x1, y1, w, h = rect
+    x2 = x1 + w
+    y2 = y1 + h
+    return Rectangle(x1, y1, x2, y2)
+
+
+def vec_rect_to_xywh(rect):
+    x, y, x2, y2 = rect
+    w = x2 - x
+    h = y2 - y
+    return x, y, w, h
diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py
index 3a7d2b1..4189005 100644
--- a/vidocp/utils/text.py
+++ b/vidocp/utils/text.py
@@ -54,4 +54,4 @@ def find_primary_text_regions(image):
 
     cnts = filter(is_likely_primary_text_segments, cnts)
 
-    return cnts
\ No newline at end of file
+    return cnts
diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py
index 2121804..18c8eb2 100644
--- a/vidocp/utils/utils.py
+++ b/vidocp/utils/utils.py
@@ -1,23 +1,4 @@
-from collections import namedtuple
-from functools import partial
-
 import cv2
-import numpy as np
-from matplotlib import pyplot as plt
-
-
-def show_mpl(image):
-
-    fig, ax = plt.subplots(1, 1)
-    fig.set_size_inches(20, 20)
-    ax.imshow(image)
-    plt.show()
-
-
-def show_cv2(image):
-
-    cv2.imshow("", image)
-    cv2.waitKey(0)
 
 
 def copy_and_normalize_channels(image):
@@ -29,161 +10,3 @@ def copy_and_normalize_channels(image):
         pass
 
     return image
-
-
-def draw_contours(image, contours):
-
-    image = copy_and_normalize_channels(image)
-
-    for cont in contours:
-        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
-
-    return image
-
-
-def draw_rectangles(image, rectangles, color=None):
-
-    image = copy_and_normalize_channels(image)
-
-    if not color:
-        color = (0, 255, 0)
-
-    for rect in rectangles:
-        x, y, w, h = rect
-        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
-
-    return image
-
-
-def draw_stats(image, stats, annotate=False):
-
-    image = copy_and_normalize_channels(image)
-
-    keys = ["x", "y", "w", "h"]
-
-    def annotate_stat(x, y, w, h):
-
-        for i, (s, v) in enumerate(zip(keys, [x, y, w, h])):
-            anno = f"{s} = {v}"
-            xann = int(x + 5)
-            yann = int(y + h - (20 * (i + 1)))
-            cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
-
-    def draw_stat(stat):
-
-        x, y, w, h, area = stat
-
-        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
-
-        if annotate:
-            annotate_stat(x, y, w, h)
-
-    for stat in stats[2:]:
-        draw_stat(stat)
-
-    return image
-
-
-def remove_overlapping(rectangles):
-    def overlap(a, b):
-        return compute_intersection(a, b) > 0
-
-    def does_not_overlap(rect, rectangles):
-        return not any(overlap(rect, r2) for r2 in rectangles if not rect == r2)
-
-    rectangles = list(map(xywh_to_vec_rect, rectangles))
-    rectangles = filter(partial(does_not_overlap, rectangles=rectangles), rectangles)
-    rectangles = map(vec_rect_to_xywh, rectangles)
-    return rectangles
-
-
-def remove_included(rectangles):
-    def included(a, b):
-        return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax
-
-    def is_not_included(rect, rectangles):
-        return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
-
-    rectangles = list(map(xywh_to_vec_rect, rectangles))
-    rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)
-    rectangles = map(vec_rect_to_xywh, rectangles)
-    return rectangles
-
-
-Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
-
-
-def make_box(x1, y1, x2, y2):
-    keys = "x1", "y1", "x2", "y2"
-    return dict(zip(keys, [x1, y1, x2, y2]))
-
-
-def compute_intersection(a, b):
-
-    dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)
-    dy = min(a.ymax, b.ymax) - max(a.ymin, b.ymin)
-
-    return dx * dy if (dx >= 0) and (dy >= 0) else 0
-
-
-def has_no_parent(hierarchy):
-    return hierarchy[-1] <= 0
-
-
-def xywh_to_vec_rect(rect):
-    x1, y1, w, h = rect
-    x2 = x1 + w
-    y2 = y1 + h
-    return Rectangle(x1, y1, x2, y2)
-
-
-def vec_rect_to_xywh(rect):
-    x, y, x2, y2 = rect
-    w = x2 - x
-    h = y2 - y
-    return x, y, w, h
-
-
-def detect_large_coherent_structures(image: np.array):
-    """Detects large coherent structures on an image.
-
-    References:
-         https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
-    """
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-    thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1]
-
-    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5))
-    dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4)
-
-    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
-    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
-
-    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
-    return cnts
-
-
-def is_large_enough(cont, min_area):
-    return cv2.contourArea(cont, False) > min_area
-
-
-def has_acceptable_format(cont, max_width_to_height_ratio):
-    _, _, w, h = cv2.boundingRect(cont)
-    return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio)
-
-
-def is_filled(hierarchy):
-    """Checks whether a hierarchy is filled.
-
-    References:
-        https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
-    """
-    return hierarchy[3] <= 0 and hierarchy[2] == -1
-
-
-def is_boxy(contour):
-    epsilon = 0.01 * cv2.arcLength(contour, True)
-    approx = cv2.approxPolyDP(contour, epsilon, True)
-    return len(approx) <= 10

From 36a62a13e51148d2420cb12930e84d78629db6b0 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 14:54:53 +0100
Subject: [PATCH 12/27] refactoring

---
 scripts/annotate.py           | 4 ++--
 vidocp/redaction_detection.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/annotate.py b/scripts/annotate.py
index 682b8ad..9ef1bce 100644
--- a/scripts/annotate.py
+++ b/scripts/annotate.py
@@ -1,7 +1,7 @@
 import argparse
 
 from vidocp.table_parsing import annotate_tables_in_pdf
-from vidocp.redaction_detection import annotate_boxes_in_pdf
+from vidocp.redaction_detection import annotate_redactions_in_pdf
 from vidocp.layout_parsing import annotate_layout_in_pdf
 from vidocp.figure_detection import detect_figures_in_pdf
 
@@ -22,7 +22,7 @@ if __name__ == "__main__":
     if args.type == "table":
         annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)
     elif args.type == "redaction":
-        annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index)
+        annotate_redactions_in_pdf(args.pdf_path, page_index=args.page_index)
     elif args.type == "layout":
         annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index)
     elif args.type == "figure":
diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py
index 1843f60..3362dc6 100644
--- a/vidocp/redaction_detection.py
+++ b/vidocp/redaction_detection.py
@@ -30,7 +30,7 @@ def find_redactions(image: np.array, min_normalized_area=200000):
     return contours
 
 
-def annotate_boxes_in_pdf(pdf_path, page_index=1):
+def annotate_redactions_in_pdf(pdf_path, page_index=1):
 
     page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
     page = np.array(page)

From 106b333dca49780368c96400956f3b8186754f52 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 16:44:07 +0100
Subject: [PATCH 13/27] filtering for connected cells... but does not quite
 work yet

---
 vidocp/table_parsing.py         | 27 ++++++++++++++++++++++-----
 vidocp/utils/post_processing.py | 24 +++++++++++++++++++-----
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index c991d43..5b811b8 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -1,14 +1,20 @@
+from functools import partial
+
 import cv2
 import numpy as np
 from pdf2image import pdf2image
 
 from vidocp.utils.display import show_mpl
-from vidocp.utils.draw import draw_stats
+from vidocp.utils.draw import draw_stats, draw_rectangles
+from vidocp.utils.filters import is_large_enough
+from vidocp.utils.post_processing import remove_isolated
 
 
 def add_external_contours(image, img):
 
-    contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+    contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+    # contours = filter(partial(is_large_enough, min_area=5000000), contours)
 
     for cnt in contours:
         x, y, w, h = cv2.boundingRect(cnt)
@@ -40,9 +46,20 @@ def parse_table(image: np.array):
     img_bin = isolate_vertical_and_horizontal_components(img_bin)
     img_bin_final = add_external_contours(img_bin, img_bin)
 
-    _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
+    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
 
-    return stats
+    def is_large_enough(stat):
+        x1, y1, w, h, area = stat
+        return area > 3000
+
+    stats = np.vstack(list(filter(is_large_enough, stats)))
+
+    rects = stats[:, :-1][2:]
+
+    # FIXME: For some reason some isolated rects remain.
+    rects = remove_isolated(rects)
+
+    return rects
 
 
 def annotate_tables_in_pdf(pdf_path, page_index=1):
@@ -51,6 +68,6 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
     page = np.array(page)
 
     stats = parse_table(page)
-    page = draw_stats(page, stats)
+    page = draw_rectangles(page, stats)
 
     show_mpl(page)
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 0cd7f62..c05ab03 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -1,5 +1,6 @@
 from collections import namedtuple
 from functools import partial
+from itertools import starmap
 
 
 def remove_overlapping(rectangles):
@@ -28,14 +29,27 @@ def remove_included(rectangles):
     return rectangles
 
 
+# FIXME: For some reason some isolated rects remain.
+def remove_isolated(rectangles):
+    def are_neighbours(a, b):
+
+        def adjacent(n, m):
+            return abs(n - m) <= 1
+
+        return any(starmap(adjacent, [(b.xmin,  a.xmax), (b.ymin, a.ymax), (b.xmax, a.xmin), (b.ymax, a.ymin)]))
+
+    def is_connected(rect, rectangles):
+        return any(are_neighbours(r2, rect) for r2 in rectangles if not rect == r2)
+
+    rectangles = list(map(xywh_to_vec_rect, rectangles))
+    rectangles = filter(partial(is_connected, rectangles=rectangles), rectangles)
+    rectangles = map(vec_rect_to_xywh, rectangles)
+    return rectangles
+
+
 Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
 
 
-def make_box(x1, y1, x2, y2):
-    keys = "x1", "y1", "x2", "y2"
-    return dict(zip(keys, [x1, y1, x2, y2]))
-
-
 def compute_intersection(a, b):
 
     dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)

From 0fc6cf8008b9a5860789eeda38a7cae03764f290 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 20:00:38 +0100
Subject: [PATCH 14/27] fixed bug in adjacency test

---
 vidocp/table_parsing.py         |  2 +-
 vidocp/utils/post_processing.py | 69 ++++++++++++++++++++++++++++-----
 2 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 5b811b8..c23aa5b 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -57,7 +57,7 @@ def parse_table(image: np.array):
     rects = stats[:, :-1][2:]
 
     # FIXME: For some reason some isolated rects remain.
-    rects = remove_isolated(rects)
+    rects = remove_isolated(rects, input_sorted=True)
 
     return rects
 
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index c05ab03..13f3149 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -1,6 +1,6 @@
 from collections import namedtuple
 from functools import partial
-from itertools import starmap
+from itertools import starmap, compress
 
 
 def remove_overlapping(rectangles):
@@ -29,24 +29,73 @@ def remove_included(rectangles):
     return rectangles
 
 
+def adjacent1d(n, m, tolerance=1):
+    return abs(n - m) <= tolerance
+
+
+def adjacent(a, b):
+    """Two rects (v1, v2), (w1, w2) are adjacent if either of:
+    - the x components of v2 and w1 match and the y components of w1 or w2 are in the range of the y components of v1 and v2
+    - the x components of v1 and w2 match and the y components of w1 or w2 are in the range of the y components of v1 and v2
+    - the y components of v2 and w1 match and the x components of w1 or w2 are in the range of the x components of v1 and v2
+    - the y components of v1 and w2 match and the x components of w1 or w2 are in the range of the x components of v1 and v2
+    """
+
+    def adjacent2d(g, h, i, j, k, l):
+        return adjacent1d(g, h) and any(k <= p <= l for p in [i, j])
+
+    if any(x is None for x in (a, b)):
+        return False
+
+    v1 = a.xmin, a.ymin
+    v2 = a.xmax, a.ymax
+
+    w1 = b.xmin, b.ymin
+    w2 = b.xmax, b.ymax
+
+    return any(
+        (
+            adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
+            adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
+            adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
+            adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
+        )
+    )
+
+
 # FIXME: For some reason some isolated rects remain.
-def remove_isolated(rectangles):
-    def are_neighbours(a, b):
-
-        def adjacent(n, m):
-            return abs(n - m) <= 1
-
-        return any(starmap(adjacent, [(b.xmin,  a.xmax), (b.ymin, a.ymax), (b.xmax, a.xmin), (b.ymax, a.ymin)]))
-
+def __remove_isolated_unsorted(rectangles):
     def is_connected(rect, rectangles):
-        return any(are_neighbours(r2, rect) for r2 in rectangles if not rect == r2)
+        return any(adjacent(r2, rect) for r2 in rectangles if not rect == r2)
 
     rectangles = list(map(xywh_to_vec_rect, rectangles))
     rectangles = filter(partial(is_connected, rectangles=rectangles), rectangles)
     rectangles = map(vec_rect_to_xywh, rectangles)
+
     return rectangles
 
 
+def __remove_isolated_sorted(rectangles):
+    def is_connected(left, center, right):
+        # if center == Rectangle(xmin=337, ymin=154, xmax=512, ymax=187) or center == Rectangle(xmin=719, ymin=188, xmax=781, ymax=251):
+        return any(starmap(adjacent, [(left, center), (center, right)]))
+
+    rectangles = list(map(xywh_to_vec_rect, rectangles))
+
+    lefts = [None, *rectangles[:-1]]
+    rights = [*rectangles[1:], None]
+
+    mask = starmap(is_connected, zip(lefts, rectangles, rights))
+    rectangles = compress(rectangles, mask)
+    rectangles = map(vec_rect_to_xywh, rectangles)
+
+    return rectangles
+
+
+def remove_isolated(rectangles, input_sorted=False):
+    return (__remove_isolated_sorted if input_sorted else __remove_isolated_unsorted)(rectangles)
+
+
 Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
 
 

From 36284f9a78a5aecbe893a2f3f66de7a2054a7a3d Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 20:01:00 +0100
Subject: [PATCH 15/27] removed obsolete lines

---
 vidocp/utils/post_processing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 13f3149..f390b03 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -77,7 +77,6 @@ def __remove_isolated_unsorted(rectangles):
 
 def __remove_isolated_sorted(rectangles):
     def is_connected(left, center, right):
-        # if center == Rectangle(xmin=337, ymin=154, xmax=512, ymax=187) or center == Rectangle(xmin=719, ymin=188, xmax=781, ymax=251):
         return any(starmap(adjacent, [(left, center), (center, right)]))
 
     rectangles = list(map(xywh_to_vec_rect, rectangles))

From 90b8613bf8677901e81ca5ee72ab0d80fae97c3f Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 21:03:40 +0100
Subject: [PATCH 16/27] filtering non-tables by bounding rect check WIP

---
 vidocp/table_parsing.py         | 56 +++++++++++++++++++++++++++------
 vidocp/utils/draw.py            | 12 +++++--
 vidocp/utils/post_processing.py | 13 +++++++-
 3 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index c23aa5b..c4d8485 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -1,4 +1,6 @@
 from functools import partial
+from itertools import chain, starmap
+from operator import attrgetter
 
 import cv2
 import numpy as np
@@ -7,7 +9,7 @@ from pdf2image import pdf2image
 from vidocp.utils.display import show_mpl
 from vidocp.utils.draw import draw_stats, draw_rectangles
 from vidocp.utils.filters import is_large_enough
-from vidocp.utils.post_processing import remove_isolated
+from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d
 
 
 def add_external_contours(image, img):
@@ -37,8 +39,46 @@ def isolate_vertical_and_horizontal_components(img_bin):
     return img_bin_final
 
 
+def has_table_shape(rects):
+
+    assert isinstance(rects, list)
+
+    points = list(chain(*map(xywh_to_vecs, rects)))
+    brect = xywh_to_vec_rect(cv2.boundingRect(np.vstack(points)))
+
+    rects = list(map(xywh_to_vec_rect, rects))
+
+    # print(rects)
+    # print(brect)
+
+    def matches_bounding_rect_corner(rect, x, y):
+        corresp_coords = list(zip(*map(attrgetter(x, y), [brect, rect])))
+        ret = all(starmap(partial(adjacent1d, tolerance=30), corresp_coords))
+        # print()
+        # print(x, y)
+        # print(brect)
+        # print(rect)
+        # print(corresp_coords)
+        # print(ret)
+
+        return ret
+
+    return all(
+        (
+            any(matches_bounding_rect_corner(r, "xmin", "ymin") for r in rects),
+            any(matches_bounding_rect_corner(r, "xmin", "ymax") for r in rects),
+            any(matches_bounding_rect_corner(r, "xmax", "ymax") for r in rects),
+            any(matches_bounding_rect_corner(r, "xmax", "ymin") for r in rects),
+        )
+    )
+
+
 def parse_table(image: np.array):
 
+    def is_large_enough(stat):
+        x1, y1, w, h, area = stat
+        return area > 3000
+
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
@@ -48,16 +88,14 @@ def parse_table(image: np.array):
 
     _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
 
-    def is_large_enough(stat):
-        x1, y1, w, h, area = stat
-        return area > 3000
-
     stats = np.vstack(list(filter(is_large_enough, stats)))
-
     rects = stats[:, :-1][2:]
+    rects = list(remove_isolated(rects, input_sorted=True))
 
-    # FIXME: For some reason some isolated rects remain.
-    rects = remove_isolated(rects, input_sorted=True)
+    # print(f"{has_table_shape(rects) = }")
+    # if not has_table_shape(rects):
+    #     print(111111111111111111111)
+    #     return []
 
     return rects
 
@@ -68,6 +106,6 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
     page = np.array(page)
 
     stats = parse_table(page)
-    page = draw_rectangles(page, stats)
+    page = draw_rectangles(page, stats, annotate=True)
 
     show_mpl(page)
diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py
index 32c66f6..13e6a73 100644
--- a/vidocp/utils/draw.py
+++ b/vidocp/utils/draw.py
@@ -13,7 +13,10 @@ def draw_contours(image, contours):
     return image
 
 
-def draw_rectangles(image, rectangles, color=None):
+def draw_rectangles(image, rectangles, color=None, annotate=False):
+
+    def annotate_rect(x, y, w, h):
+        cv2.putText(image, "+", (x + (w // 2) - 12, y + (h // 2) + 9), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
 
     image = copy_and_normalize_channels(image)
 
@@ -24,13 +27,14 @@ def draw_rectangles(image, rectangles, color=None):
         x, y, w, h = rect
         cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
 
+        if annotate:
+            annotate_rect(x, y, w, h)
+
     return image
 
 
 def draw_stats(image, stats, annotate=False):
 
-    image = copy_and_normalize_channels(image)
-
     keys = ["x", "y", "w", "h"]
 
     def annotate_stat(x, y, w, h):
@@ -50,6 +54,8 @@ def draw_stats(image, stats, annotate=False):
         if annotate:
             annotate_stat(x, y, w, h)
 
+    image = copy_and_normalize_channels(image)
+
     for stat in stats[2:]:
         draw_stat(stat)
 
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index f390b03..a3f1272 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -111,10 +111,21 @@ def has_no_parent(hierarchy):
 
 
 def xywh_to_vec_rect(rect):
+    v1, v2 = xywh_to_vecs(rect)
+    return Rectangle(*v1, *v2)
+
+
+def vecs_to_vec_rect(rect):
+    print(rect)
+    v1, v2 = rect
+    return Rectangle(*v1, *v2)
+
+
+def xywh_to_vecs(rect):
     x1, y1, w, h = rect
     x2 = x1 + w
     y2 = y1 + h
-    return Rectangle(x1, y1, x2, y2)
+    return (x1, y1), (x2, y2)
 
 
 def vec_rect_to_xywh(rect):

From 295666c28f2a2b24783d0a27f0d71631a7c9a7d2 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 21:25:01 +0100
Subject: [PATCH 17/27] added todo comments

---
 vidocp/table_parsing.py         | 5 ++++-
 vidocp/utils/post_processing.py | 7 ++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index c4d8485..85989e7 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -6,7 +6,7 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils.display import show_mpl
+from vidocp.utils.display import show_mpl, show_cv2
 from vidocp.utils.draw import draw_stats, draw_rectangles
 from vidocp.utils.filters import is_large_enough
 from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d
@@ -39,6 +39,7 @@ def isolate_vertical_and_horizontal_components(img_bin):
     return img_bin_final
 
 
+# FIXME: does not work yet
 def has_table_shape(rects):
 
     assert isinstance(rects, list)
@@ -83,6 +84,7 @@ def parse_table(image: np.array):
     th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
+
     img_bin = isolate_vertical_and_horizontal_components(img_bin)
     img_bin_final = add_external_contours(img_bin, img_bin)
 
@@ -90,6 +92,7 @@ def parse_table(image: np.array):
 
     stats = np.vstack(list(filter(is_large_enough, stats)))
     rects = stats[:, :-1][2:]
+    # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
     rects = list(remove_isolated(rects, input_sorted=True))
 
     # print(f"{has_table_shape(rects) = }")
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index a3f1272..06dc1d9 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -17,11 +17,12 @@ def remove_overlapping(rectangles):
 
 
 def remove_included(rectangles):
-    def included(a, b):
-        return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax
+    def includes(a, b, tol=3):
+        """does a include b?"""
+        return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax
 
     def is_not_included(rect, rectangles):
-        return not any(included(r2, rect) for r2 in rectangles if not rect == r2)
+        return not any(includes(r2, rect) for r2 in rectangles if not rect == r2)
 
     rectangles = list(map(xywh_to_vec_rect, rectangles))
     rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles)

From 87cecadb440b9437aca1b968b5a4e92c6023c24f Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Sun, 6 Feb 2022 21:27:39 +0100
Subject: [PATCH 18/27] applied black

---
 vidocp/table_parsing.py | 2 --
 vidocp/utils/draw.py    | 1 -
 2 files changed, 3 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 85989e7..580b2f3 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -75,7 +75,6 @@ def has_table_shape(rects):
 
 
 def parse_table(image: np.array):
-
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
         return area > 3000
@@ -84,7 +83,6 @@ def parse_table(image: np.array):
     th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
-
     img_bin = isolate_vertical_and_horizontal_components(img_bin)
     img_bin_final = add_external_contours(img_bin, img_bin)
 
diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py
index 13e6a73..7b23f0d 100644
--- a/vidocp/utils/draw.py
+++ b/vidocp/utils/draw.py
@@ -14,7 +14,6 @@ def draw_contours(image, contours):
 
 
 def draw_rectangles(image, rectangles, color=None, annotate=False):
-
     def annotate_rect(x, y, w, h):
         cv2.putText(image, "+", (x + (w // 2) - 12, y + (h // 2) + 9), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
 

From f7d3e396921c04f7d463db7d7b858f7294313937 Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Tue, 8 Feb 2022 15:05:12 +0100
Subject: [PATCH 19/27] nix dolles

---
 vidocp/layout_detection.py |  1 +
 vidocp/table_parsing.py    |  7 +++----
 vidocp/utils/draw.py       | 29 -----------------------------
 3 files changed, 4 insertions(+), 33 deletions(-)

diff --git a/vidocp/layout_detection.py b/vidocp/layout_detection.py
index d559df0..1d49684 100644
--- a/vidocp/layout_detection.py
+++ b/vidocp/layout_detection.py
@@ -23,6 +23,7 @@ def find_layout_boxes(image: np.array):
 
     contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     contours = imutils.grab_contours(contours)
+
     for c in contours:
         peri = cv2.arcLength(c, True)
         approx = cv2.approxPolyDP(c, 0.04 * peri, True)
diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 580b2f3..2ead96c 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -6,10 +6,9 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils.display import show_mpl, show_cv2
-from vidocp.utils.draw import draw_stats, draw_rectangles
-from vidocp.utils.filters import is_large_enough
-from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d
+from vidocp.utils.display import show_mpl
+from vidocp.utils.draw import draw_rectangles
+from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
 
 
 def add_external_contours(image, img):
diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py
index 7b23f0d..2f7ef06 100644
--- a/vidocp/utils/draw.py
+++ b/vidocp/utils/draw.py
@@ -30,32 +30,3 @@ def draw_rectangles(image, rectangles, color=None, annotate=False):
             annotate_rect(x, y, w, h)
 
     return image
-
-
-def draw_stats(image, stats, annotate=False):
-
-    keys = ["x", "y", "w", "h"]
-
-    def annotate_stat(x, y, w, h):
-
-        for i, (s, v) in enumerate(zip(keys, [x, y, w, h])):
-            anno = f"{s} = {v}"
-            xann = int(x + 5)
-            yann = int(y + h - (20 * (i + 1)))
-            cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
-
-    def draw_stat(stat):
-
-        x, y, w, h, area = stat
-
-        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
-
-        if annotate:
-            annotate_stat(x, y, w, h)
-
-    image = copy_and_normalize_channels(image)
-
-    for stat in stats[2:]:
-        draw_stat(stat)
-
-    return image

From 4964c8f5a154a24e967665d169fd72e4e9673538 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Thu, 10 Feb 2022 10:22:22 +0100
Subject: [PATCH 20/27] some changes to fix some minor bugs in table_parsing.py
 and post_processing.py

---
 vidocp/table_parsing.py         | 10 ++++++++--
 vidocp/utils/post_processing.py | 18 +++++++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 2ead96c..0131c3c 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -76,10 +76,13 @@ def has_table_shape(rects):
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
-        return area > 3000
+        #  was set too higg (3000): Boxes in a Table can definetly be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
+        #  with extra condition for the length of height and width weirdly narrow rectangles can be filtered
+        return area > 500 and w > 35 and h > 15
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
+    #changed threshold value from 150 to 200 b
+    th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
     img_bin = isolate_vertical_and_horizontal_components(img_bin)
@@ -88,9 +91,12 @@ def parse_table(image: np.array):
     _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
 
     stats = np.vstack(list(filter(is_large_enough, stats)))
+    print(stats)
     rects = stats[:, :-1][2:]
+
     # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
     rects = list(remove_isolated(rects, input_sorted=True))
+    print(rects)
 
     # print(f"{has_table_shape(rects) = }")
     # if not has_table_shape(rects):
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 06dc1d9..77f8cab 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -29,8 +29,8 @@ def remove_included(rectangles):
     rectangles = map(vec_rect_to_xywh, rectangles)
     return rectangles
 
-
-def adjacent1d(n, m, tolerance=1):
+#tolerance was set too low (1) most lines are 2px wide
+def adjacent1d(n, m, tolerance=2):
     return abs(n - m) <= tolerance
 
 
@@ -43,6 +43,7 @@ def adjacent(a, b):
     """
 
     def adjacent2d(g, h, i, j, k, l):
+        #print(abs(g-h), [k <= p <= l for p in [i, j]])
         return adjacent1d(g, h) and any(k <= p <= l for p in [i, j])
 
     if any(x is None for x in (a, b)):
@@ -50,10 +51,17 @@ def adjacent(a, b):
 
     v1 = a.xmin, a.ymin
     v2 = a.xmax, a.ymax
-
+    print("topleft and bottom right rec1", v1,v2)
     w1 = b.xmin, b.ymin
     w2 = b.xmax, b.ymax
-
+    print("topleft and bottom right rec2", w1, w2)
+    # some rectangles are compared twice
+    print((
+            adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
+            adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
+            adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
+            adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
+        ))
     return any(
         (
             adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
@@ -81,6 +89,7 @@ def __remove_isolated_sorted(rectangles):
         return any(starmap(adjacent, [(left, center), (center, right)]))
 
     rectangles = list(map(xywh_to_vec_rect, rectangles))
+    # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles)
 
     lefts = [None, *rectangles[:-1]]
     rights = [*rectangles[1:], None]
@@ -117,7 +126,6 @@ def xywh_to_vec_rect(rect):
 
 
 def vecs_to_vec_rect(rect):
-    print(rect)
     v1, v2 = rect
     return Rectangle(*v1, *v2)
 

From 07907d45dd3c0a0fa6383ec4a938846ed8c375b1 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Thu, 10 Feb 2022 10:56:03 +0100
Subject: [PATCH 21/27] some changes to fix some minor bugs in table_parsing.py
 and post_processing.py

---
 vidocp/table_parsing.py         |  8 +++-----
 vidocp/utils/post_processing.py | 17 ++++++++---------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 0131c3c..adaa210 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -76,12 +76,12 @@ def has_table_shape(rects):
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
-        #  was set too higg (3000): Boxes in a Table can definetly be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
-        #  with extra condition for the length of height and width weirdly narrow rectangles can be filtered
+        #  was set too higg (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
+        #  with extra condition for the length of height and width, weirdly narrow rectangles can be filtered
         return area > 500 and w > 35 and h > 15
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    #changed threshold value from 150 to 200 b
+    #changed threshold value from 150 to 200 because of a shaded edgecase table
     th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
@@ -91,12 +91,10 @@ def parse_table(image: np.array):
     _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
 
     stats = np.vstack(list(filter(is_large_enough, stats)))
-    print(stats)
     rects = stats[:, :-1][2:]
 
     # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
     rects = list(remove_isolated(rects, input_sorted=True))
-    print(rects)
 
     # print(f"{has_table_shape(rects) = }")
     # if not has_table_shape(rects):
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 77f8cab..6cc9452 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -43,7 +43,6 @@ def adjacent(a, b):
     """
 
     def adjacent2d(g, h, i, j, k, l):
-        #print(abs(g-h), [k <= p <= l for p in [i, j]])
         return adjacent1d(g, h) and any(k <= p <= l for p in [i, j])
 
     if any(x is None for x in (a, b)):
@@ -51,17 +50,17 @@ def adjacent(a, b):
 
     v1 = a.xmin, a.ymin
     v2 = a.xmax, a.ymax
-    print("topleft and bottom right rec1", v1,v2)
+    #print("topleft and bottom right rec1", v1,v2)
     w1 = b.xmin, b.ymin
     w2 = b.xmax, b.ymax
-    print("topleft and bottom right rec2", w1, w2)
+    #print("topleft and bottom right rec2", w1, w2)
     # some rectangles are compared twice
-    print((
-            adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
-            adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
-            adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
-            adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
-        ))
+    # print((
+    #         adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
+    #         adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
+    #         adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
+    #         adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
+    #     ))
     return any(
         (
             adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),

From 885fc22f9de9b4e44d7657b117b260f0a774f091 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Fri, 11 Feb 2022 15:59:54 +0100
Subject: [PATCH 22/27] added changes to parse scanned pdfs

---
 .gitignore              |  1 +
 vidocp/table_parsing.py | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bac3af5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/pdfs/
diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index adaa210..455e9f3 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -23,6 +23,19 @@ def add_external_contours(image, img):
 
     return image
 
+def process_lines(img_bin_h, img_bin_v):
+    def draw_lines(lines, img_bin):
+        for line in lines:
+            for x1, y1, x2, y2 in line:
+                cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 3)
+                return img_bin
+    lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500, 700, 0)
+    draw_lines(lines_h, img_bin_h)
+
+    lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 700, 0)
+    draw_lines(lines_v,img_bin_v)
+
+    return img_bin_h, img_bin_v
 
 def isolate_vertical_and_horizontal_components(img_bin):
 
@@ -33,6 +46,18 @@ def isolate_vertical_and_horizontal_components(img_bin):
     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
 
+    #img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v)
+
+    # lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500)
+    # for line in lines_h:
+    #     for x1, y1, x2, y2 in line:
+    #         cv2.line(img_bin_h, (x1, y1), (x2, y2), (255, 255, 255), 3)
+    # lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 600, 0)
+    # for line in lines_v:
+    #     for x1, y1, x2, y2 in line:
+    #         cv2.line(img_bin_v, (x1, y1), (x2, y2), (255, 255, 255), 3)
+
+
     img_bin_final = img_bin_h | img_bin_v
 
     return img_bin_final
@@ -73,15 +98,21 @@ def has_table_shape(rects):
     )
 
 
+
+
+
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
-        #  was set too higg (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
+        #  was set too high (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
         #  with extra condition for the length of height and width, weirdly narrow rectangles can be filtered
         return area > 500 and w > 35 and h > 15
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    #changed threshold value from 150 to 200 because of a shaded edgecase table
+    # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
+    # th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY)
+
+    # changed threshold value from 150 to 195 because of a shaded edgecase table
     th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
@@ -96,10 +127,6 @@ def parse_table(image: np.array):
     # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
     rects = list(remove_isolated(rects, input_sorted=True))
 
-    # print(f"{has_table_shape(rects) = }")
-    # if not has_table_shape(rects):
-    #     print(111111111111111111111)
-    #     return []
 
     return rects
 

From c2faf7d00bf4ed9d20d5d3422d172ae35d79643c Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Mon, 14 Feb 2022 11:04:04 +0100
Subject: [PATCH 23/27] adjusted isolation of vertical and horizontal
 components to be more robust to scanned pages; work in progress

---
 vidocp/table_parsing.py | 50 ++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 455e9f3..88a8790 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -10,6 +10,7 @@ from vidocp.utils.display import show_mpl
 from vidocp.utils.draw import draw_rectangles
 from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
 
+import matplotlib.pyplot as plt
 
 def add_external_contours(image, img):
 
@@ -27,12 +28,12 @@ def process_lines(img_bin_h, img_bin_v):
     def draw_lines(lines, img_bin):
         for line in lines:
             for x1, y1, x2, y2 in line:
-                cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 3)
+                cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 6)
                 return img_bin
-    lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500, 700, 0)
+    lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi / 180, 500, 500, 250)
     draw_lines(lines_h, img_bin_h)
 
-    lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 700, 0)
+    lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 500, 250)
     draw_lines(lines_v,img_bin_v)
 
     return img_bin_h, img_bin_v
@@ -46,20 +47,17 @@ def isolate_vertical_and_horizontal_components(img_bin):
     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
 
-    #img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v)
-
-    # lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500)
-    # for line in lines_h:
-    #     for x1, y1, x2, y2 in line:
-    #         cv2.line(img_bin_h, (x1, y1), (x2, y2), (255, 255, 255), 3)
-    # lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 600, 0)
-    # for line in lines_v:
-    #     for x1, y1, x2, y2 in line:
-    #         cv2.line(img_bin_v, (x1, y1), (x2, y2), (255, 255, 255), 3)
-
+    img_bin_h = cv2.dilate(img_bin_h, kernel_h, 1)
+    img_bin_v = cv2.dilate(img_bin_v, kernel_v, 1)
 
+    img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
+    img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
+    # img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v)
     img_bin_final = img_bin_h | img_bin_v
-
+    kernel = np.ones((5, 5), np.uint8)
+    # img_bin_final = cv2.dilate(img_bin_final, kernel, 2)
+    th1, img_bin_final = cv2.threshold(img_bin_final, 10, 255, cv2.THRESH_BINARY)
+    show_mpl(img_bin_final)
     return img_bin_final
 
 
@@ -99,21 +97,27 @@ def has_table_shape(rects):
 
 
 
+def apply_motion_blur(image, size, angle):
+    k = np.zeros((size, size), dtype=np.float32)
+    k[ (size-1)// 2 , :] = np.ones(size, dtype=np.float32)
+    k = cv2.warpAffine(k, cv2.getRotationMatrix2D( (size / 2 -0.5 , size / 2 -0.5 ) , angle, 1.0), (size, size) )
+    k = k * ( 1.0 / np.sum(k) )
+    return cv2.filter2D(image, -1, k)
 
 
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
         #  was set too high (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
-        #  with extra condition for the length of height and width, weirdly narrow rectangles can be filtered
+        #  with extra condition for the length of height and width weirdly narrow rectangles can be filtered
         return area > 500 and w > 35 and h > 15
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
-    # th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY)
-
+    blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
+    th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY)
+    show_mpl(img_bin)
     # changed threshold value from 150 to 195 because of a shaded edgecase table
-    th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
+    # th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
     img_bin = isolate_vertical_and_horizontal_components(img_bin)
@@ -127,6 +131,10 @@ def parse_table(image: np.array):
     # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
     rects = list(remove_isolated(rects, input_sorted=True))
 
+    # if not has_table_shape(rects):
+    #     return False
+
+
 
     return rects
 
@@ -138,5 +146,7 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
 
     stats = parse_table(page)
     page = draw_rectangles(page, stats, annotate=True)
+    # if stats:
+    #     page = draw_rectangles(page, stats, annotate=True)
 
     show_mpl(page)

From 57ca47f38d68fe56baee483b1296d11e8a99e58b Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Wed, 16 Feb 2022 12:37:17 +0100
Subject: [PATCH 24/27] different approaches to isolate line components of
 tables in scanned pdf files.

---
 .gitignore              |   1 +
 vidocp/table_parsing.py | 119 ++++++++++++++++++++++++++++------------
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/.gitignore b/.gitignore
index bac3af5..1cf261d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /pdfs/
+/results/
diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 88a8790..2301ac1 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -10,10 +10,8 @@ from vidocp.utils.display import show_mpl
 from vidocp.utils.draw import draw_rectangles
 from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
 
-import matplotlib.pyplot as plt
 
 def add_external_contours(image, img):
-
     contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
 
     # contours = filter(partial(is_large_enough, min_area=5000000), contours)
@@ -24,46 +22,107 @@ def add_external_contours(image, img):
 
     return image
 
-def process_lines(img_bin_h, img_bin_v):
-    def draw_lines(lines, img_bin):
-        for line in lines:
+
+def process_lines(img_line_component):
+    def draw_lines(detected_lines, img_bin):
+        for line in detected_lines:
             for x1, y1, x2, y2 in line:
                 cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 6)
                 return img_bin
-    lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi / 180, 500, 500, 250)
-    draw_lines(lines_h, img_bin_h)
 
-    lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 500, 250)
-    draw_lines(lines_v,img_bin_v)
+    lines = cv2.HoughLines(img_line_component, 1, np.pi / 180, 500)
+    draw_lines(lines, lines)
+
+    return img_line_component
+
+# def isolate_vertical_and_horizontal_components(img_bin):
+#     line_min_width = 50
+#     kernel_h = np.ones((1, line_min_width), np.uint8)
+#     kernel_v = np.ones((line_min_width, 1), np.uint8)
+#
+#     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+#     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+#     show_mpl(img_bin_h | img_bin_v)
+#
+#     img_bin_h = apply_motion_blur(img_bin_h, 140, 0)
+#     img_bin_v = apply_motion_blur(img_bin_v, 140, 90)
+#     show_mpl(img_bin_h | img_bin_v)
+#
+#     th1, img_bin_h = cv2.threshold(img_bin_h, 95, 255, cv2.THRESH_BINARY)
+#     th1, img_bin_v = cv2.threshold(img_bin_v, 95, 255, cv2.THRESH_BINARY)
+#     show_mpl(img_bin_h | img_bin_v)
+#
+#     kernel_h = np.ones((1, 8), np.uint8)
+#     kernel_v = np.ones((8, 1), np.uint8)
+#     img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=4)
+#     img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=4)
+#
+#     img_bin_final = img_bin_h | img_bin_v
+#     show_mpl(img_bin_final)
+#     # th 130
+#     #th1, img_bin_final = cv2.threshold(img_bin_final, 90, 255, cv2.THRESH_BINARY)
+#     #show_mpl(img_bin_final)
+#     return img_bin_final
 
-    return img_bin_h, img_bin_v
 
 def isolate_vertical_and_horizontal_components(img_bin):
-
     line_min_width = 30
     kernel_h = np.ones((1, line_min_width), np.uint8)
     kernel_v = np.ones((line_min_width, 1), np.uint8)
 
     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+    show_mpl(img_bin_h | img_bin_v)
 
-    img_bin_h = cv2.dilate(img_bin_h, kernel_h, 1)
-    img_bin_v = cv2.dilate(img_bin_v, kernel_v, 1)
+    img_bin_h = apply_motion_blur(img_bin_h, 150, 0)
+    img_bin_v = apply_motion_blur(img_bin_v, 150, 90)
+    show_mpl(img_bin_h | img_bin_v)
+
+    th1, img_bin_h = cv2.threshold(img_bin_h, 70, 255, cv2.THRESH_BINARY)
+    th1, img_bin_v = cv2.threshold(img_bin_v, 70, 255, cv2.THRESH_BINARY)
+    show_mpl(img_bin_h | img_bin_v)
+
+    kernel_h = np.ones((1, 10), np.uint8)
+    kernel_v = np.ones((10, 1), np.uint8)
+    img_bin_h = cv2.erode(img_bin_h, kernel_h, iterations=1)
+    img_bin_v = cv2.erode(img_bin_v, kernel_v, iterations=1)
 
-    img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
-    img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
-    # img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v)
     img_bin_final = img_bin_h | img_bin_v
-    kernel = np.ones((5, 5), np.uint8)
-    # img_bin_final = cv2.dilate(img_bin_final, kernel, 2)
-    th1, img_bin_final = cv2.threshold(img_bin_final, 10, 255, cv2.THRESH_BINARY)
     show_mpl(img_bin_final)
+    # th 130
+    # th1, img_bin_final = cv2.threshold(img_bin_final, 150, 255, cv2.THRESH_BINARY)
+    # show_mpl(img_bin_final)
     return img_bin_final
 
+# def isolate_vertical_and_horizontal_components(img_bin):
+#     line_min_width = 30
+#     kernel_h = np.ones((1, line_min_width), np.uint8)
+#     kernel_v = np.ones((line_min_width, 1), np.uint8)
+#
+#     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+#     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+#     show_mpl(img_bin_h | img_bin_v)
+#
+#     kernel_h = np.ones((1, 30), np.uint8)
+#     kernel_v = np.ones((30, 1), np.uint8)
+#     img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=1)
+#     img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=1)
+#     show_mpl(img_bin_h | img_bin_v)
+#
+#     img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
+#     img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
+#
+#     img_bin_final = img_bin_h | img_bin_v
+#     show_mpl(img_bin_final)
+#     # th 130
+#     th1, img_bin_final = cv2.threshold(img_bin_final, 125, 255, cv2.THRESH_BINARY)
+#     show_mpl(img_bin_final)
+#
+#     return img_bin_final
+
 
 # FIXME: does not work yet
 def has_table_shape(rects):
-
     assert isinstance(rects, list)
 
     points = list(chain(*map(xywh_to_vecs, rects)))
@@ -96,29 +155,24 @@ def has_table_shape(rects):
     )
 
 
-
 def apply_motion_blur(image, size, angle):
     k = np.zeros((size, size), dtype=np.float32)
-    k[ (size-1)// 2 , :] = np.ones(size, dtype=np.float32)
-    k = cv2.warpAffine(k, cv2.getRotationMatrix2D( (size / 2 -0.5 , size / 2 -0.5 ) , angle, 1.0), (size, size) )
-    k = k * ( 1.0 / np.sum(k) )
+    k[(size - 1) // 2, :] = np.ones(size, dtype=np.float32)
+    k = cv2.warpAffine(k, cv2.getRotationMatrix2D((size / 2 - 0.5, size / 2 - 0.5), angle, 1.0), (size, size))
+    k = k * (1.0 / np.sum(k))
     return cv2.filter2D(image, -1, k)
 
 
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
-        #  was set too high (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
-        #  with extra condition for the length of height and width weirdly narrow rectangles can be filtered
         return area > 500 and w > 35 and h > 15
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
-    th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY)
-    show_mpl(img_bin)
-    # changed threshold value from 150 to 195 because of a shaded edgecase table
-    # th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
+    # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
+    th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
+    show_mpl(img_bin)
 
     img_bin = isolate_vertical_and_horizontal_components(img_bin)
     img_bin_final = add_external_contours(img_bin, img_bin)
@@ -134,13 +188,10 @@ def parse_table(image: np.array):
     # if not has_table_shape(rects):
     #     return False
 
-
-
     return rects
 
 
 def annotate_tables_in_pdf(pdf_path, page_index=1):
-
     page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
     page = np.array(page)
 

From d70781f4aa8b3dd6286151d2cf85a85fe82e1caa Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Thu, 17 Feb 2022 16:45:55 +0100
Subject: [PATCH 25/27] changed tolerance in adjacent1d function in
 post_processing.py from 2 to 4; added function so vertical and horizontal
 components do not overlap the layout box of the table

---
 vidocp/table_parsing.py         | 120 ++++++++++----------------------
 vidocp/utils/post_processing.py |   2 +-
 2 files changed, 36 insertions(+), 86 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index 2301ac1..dd65cd2 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -9,6 +9,7 @@ from pdf2image import pdf2image
 from vidocp.utils.display import show_mpl
 from vidocp.utils.draw import draw_rectangles
 from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
+from vidocp.layout_parsing import parse_layout
 
 
 def add_external_contours(image, img):
@@ -23,50 +24,8 @@ def add_external_contours(image, img):
     return image
 
 
-def process_lines(img_line_component):
-    def draw_lines(detected_lines, img_bin):
-        for line in detected_lines:
-            for x1, y1, x2, y2 in line:
-                cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 6)
-                return img_bin
-
-    lines = cv2.HoughLines(img_line_component, 1, np.pi / 180, 500)
-    draw_lines(lines, lines)
-
-    return img_line_component
-
-# def isolate_vertical_and_horizontal_components(img_bin):
-#     line_min_width = 50
-#     kernel_h = np.ones((1, line_min_width), np.uint8)
-#     kernel_v = np.ones((line_min_width, 1), np.uint8)
-#
-#     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
-#     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-#     show_mpl(img_bin_h | img_bin_v)
-#
-#     img_bin_h = apply_motion_blur(img_bin_h, 140, 0)
-#     img_bin_v = apply_motion_blur(img_bin_v, 140, 90)
-#     show_mpl(img_bin_h | img_bin_v)
-#
-#     th1, img_bin_h = cv2.threshold(img_bin_h, 95, 255, cv2.THRESH_BINARY)
-#     th1, img_bin_v = cv2.threshold(img_bin_v, 95, 255, cv2.THRESH_BINARY)
-#     show_mpl(img_bin_h | img_bin_v)
-#
-#     kernel_h = np.ones((1, 8), np.uint8)
-#     kernel_v = np.ones((8, 1), np.uint8)
-#     img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=4)
-#     img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=4)
-#
-#     img_bin_final = img_bin_h | img_bin_v
-#     show_mpl(img_bin_final)
-#     # th 130
-#     #th1, img_bin_final = cv2.threshold(img_bin_final, 90, 255, cv2.THRESH_BINARY)
-#     #show_mpl(img_bin_final)
-#     return img_bin_final
-
-
-def isolate_vertical_and_horizontal_components(img_bin):
-    line_min_width = 30
+def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
+    line_min_width = 47
     kernel_h = np.ones((1, line_min_width), np.uint8)
     kernel_v = np.ones((line_min_width, 1), np.uint8)
 
@@ -74,51 +33,32 @@ def isolate_vertical_and_horizontal_components(img_bin):
     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
     show_mpl(img_bin_h | img_bin_v)
 
-    img_bin_h = apply_motion_blur(img_bin_h, 150, 0)
-    img_bin_v = apply_motion_blur(img_bin_v, 150, 90)
+    kernel_h = np.ones((1, 30), np.uint8)
+    kernel_v = np.ones((30, 1), np.uint8)
+    img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2)
+    img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
     show_mpl(img_bin_h | img_bin_v)
 
-    th1, img_bin_h = cv2.threshold(img_bin_h, 70, 255, cv2.THRESH_BINARY)
-    th1, img_bin_v = cv2.threshold(img_bin_v, 70, 255, cv2.THRESH_BINARY)
-    show_mpl(img_bin_h | img_bin_v)
-
-    kernel_h = np.ones((1, 10), np.uint8)
-    kernel_v = np.ones((10, 1), np.uint8)
-    img_bin_h = cv2.erode(img_bin_h, kernel_h, iterations=1)
-    img_bin_v = cv2.erode(img_bin_v, kernel_v, iterations=1)
+    img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
+    img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
 
     img_bin_final = img_bin_h | img_bin_v
     show_mpl(img_bin_final)
-    # th 130
-    # th1, img_bin_final = cv2.threshold(img_bin_final, 150, 255, cv2.THRESH_BINARY)
-    # show_mpl(img_bin_final)
+
+    th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY)
+    img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
+    show_mpl(img_bin_final)
+    img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects)
+    show_mpl(img_bin_final)
+
     return img_bin_final
 
-# def isolate_vertical_and_horizontal_components(img_bin):
-#     line_min_width = 30
-#     kernel_h = np.ones((1, line_min_width), np.uint8)
-#     kernel_v = np.ones((line_min_width, 1), np.uint8)
-#
-#     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
-#     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
-#     show_mpl(img_bin_h | img_bin_v)
-#
-#     kernel_h = np.ones((1, 30), np.uint8)
-#     kernel_v = np.ones((30, 1), np.uint8)
-#     img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=1)
-#     img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=1)
-#     show_mpl(img_bin_h | img_bin_v)
-#
-#     img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
-#     img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
-#
-#     img_bin_final = img_bin_h | img_bin_v
-#     show_mpl(img_bin_final)
-#     # th 130
-#     th1, img_bin_final = cv2.threshold(img_bin_final, 125, 255, cv2.THRESH_BINARY)
-#     show_mpl(img_bin_final)
-#
-#     return img_bin_final
+
+def disconnect_non_existing_cells(img_bin, bounding_rects):
+    for rect in bounding_rects:
+        x, y, w, h = rect
+        img_bin = cv2.rectangle(img_bin, (x, y), (x + w, y + h), (0, 0, 0), 5)
+    return img_bin
 
 
 # FIXME: does not work yet
@@ -163,18 +103,28 @@ def apply_motion_blur(image, size, angle):
     return cv2.filter2D(image, -1, k)
 
 
+def find_table_layout_boxes(image: np.array):
+    layout_boxes = parse_layout(image)
+    table_boxes = []
+    for box in layout_boxes:
+        (x, y, w, h) = box
+        if w * h >= 300000:
+            table_boxes.append(box)
+    return table_boxes
+
+
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
-        return area > 500 and w > 35 and h > 15
+        return area > 2000 and w > 35 and h > 25
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
     th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
     show_mpl(img_bin)
 
-    img_bin = isolate_vertical_and_horizontal_components(img_bin)
+    table_layout_boxes = find_table_layout_boxes(image)
+    img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes)
     img_bin_final = add_external_contours(img_bin, img_bin)
 
     _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 6cc9452..02018c3 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -30,7 +30,7 @@ def remove_included(rectangles):
     return rectangles
 
 #tolerance was set too low (1) most lines are 2px wide
-def adjacent1d(n, m, tolerance=2):
+def adjacent1d(n, m, tolerance=4):
     return abs(n - m) <= tolerance
 
 

From 723c6606e1a3e2d192e9bfbb1ebc1e56c2cf8fe6 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Fri, 18 Feb 2022 16:35:50 +0100
Subject: [PATCH 26/27] kernel size for morphology ex set a bit higher, so fewer
 non-table structures are detected. Reduced the kernel size of the directional
 motion blur and increased the threshold a little bit so narrow cells won't be
 split up.

Detected a problem with the cell filtering for certain scanned PDFs.
---
 vidocp/table_parsing.py         | 14 ++++++++------
 vidocp/utils/post_processing.py | 14 ++------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index dd65cd2..c43a457 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -25,7 +25,7 @@ def add_external_contours(image, img):
 
 
 def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
-    line_min_width = 47
+    line_min_width = 48
     kernel_h = np.ones((1, line_min_width), np.uint8)
     kernel_v = np.ones((line_min_width, 1), np.uint8)
 
@@ -39,15 +39,17 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
     img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
     show_mpl(img_bin_h | img_bin_v)
 
-    img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
-    img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
+    #reduced filtersize from 100 to 80 to minimize splitting narrow cells
+    img_bin_h = apply_motion_blur(img_bin_h, 80, 0)
+    img_bin_v = apply_motion_blur(img_bin_v, 80, 90)
 
     img_bin_final = img_bin_h | img_bin_v
     show_mpl(img_bin_final)
-
-    th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY)
+    #changed threshold from 110 to 120 to minimize cell splitting
+    th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
     img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
     show_mpl(img_bin_final)
+    # problem if layout parser detects too big of a layout box as in VV-748542.pdf p.22
     img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects)
     show_mpl(img_bin_final)
 
@@ -108,7 +110,7 @@ def find_table_layout_boxes(image: np.array):
     table_boxes = []
     for box in layout_boxes:
         (x, y, w, h) = box
-        if w * h >= 300000:
+        if w * h >= 100000:
             table_boxes.append(box)
     return table_boxes
 
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 02018c3..79626d2 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -50,17 +50,10 @@ def adjacent(a, b):
 
     v1 = a.xmin, a.ymin
     v2 = a.xmax, a.ymax
-    #print("topleft and bottom right rec1", v1,v2)
+
     w1 = b.xmin, b.ymin
     w2 = b.xmax, b.ymax
-    #print("topleft and bottom right rec2", w1, w2)
-    # some rectangles are compared twice
-    # print((
-    #         adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
-    #         adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
-    #         adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
-    #         adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
-    #     ))
+
     return any(
         (
             adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
@@ -86,9 +79,7 @@ def __remove_isolated_unsorted(rectangles):
 def __remove_isolated_sorted(rectangles):
     def is_connected(left, center, right):
         return any(starmap(adjacent, [(left, center), (center, right)]))
-
     rectangles = list(map(xywh_to_vec_rect, rectangles))
-    # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles)
 
     lefts = [None, *rectangles[:-1]]
     rights = [*rectangles[1:], None]
@@ -96,7 +87,6 @@ def __remove_isolated_sorted(rectangles):
     mask = starmap(is_connected, zip(lefts, rectangles, rights))
     rectangles = compress(rectangles, mask)
     rectangles = map(vec_rect_to_xywh, rectangles)
-
     return rectangles
 
 

From 2a68e1b221881c1598864c719fe583d3b00a227c Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Fri, 18 Feb 2022 16:36:25 +0100
Subject: [PATCH 27/27] kernel size for morphology ex set a bit higher, so fewer
 non-table structures are detected. Reduced the kernel size of the directional
 motion blur and increased the threshold a little bit so narrow cells won't be
 split up.

Detected a problem with the cell filtering for certain scanned PDFs.
---
 vidocp/utils/post_processing.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 79626d2..a3a04b1 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -43,6 +43,7 @@ def adjacent(a, b):
     """
 
     def adjacent2d(g, h, i, j, k, l):
+        #print(adjacent1d(g, h) and any(k <= p <= l for p in [i, j]))
         return adjacent1d(g, h) and any(k <= p <= l for p in [i, j])
 
     if any(x is None for x in (a, b)):
@@ -78,6 +79,7 @@ def __remove_isolated_unsorted(rectangles):
 
 def __remove_isolated_sorted(rectangles):
     def is_connected(left, center, right):
+        # print(left,center,right, list(starmap(adjacent, [(left, center), (center, right)])))
         return any(starmap(adjacent, [(left, center), (center, right)]))
     rectangles = list(map(xywh_to_vec_rect, rectangles))