From d555e86475e82024f8e1a5fc5b0ac70faa091ee1 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:24:04 +0100 Subject: [PATCH 01/27] refactored figure detection once --- vidocp/figure_detection.py | 49 +++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index e852646..af68835 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -18,11 +18,29 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): return is_large_enough(cont, min_area) and has_acceptable_format(cont, max_width_to_hight_ratio) -def detect_figures(image: np.array): +def remove_primary_text_regions(image): + """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + + References: + https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background + """ + + def filter_likely_primary_text_segments(cnts): + for c in cnts: + area = cv2.contourArea(c) + if area > 800 and area < 15000: + yield cv2.boundingRect(c) image = image.copy() gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) @@ -33,16 +51,19 @@ def detect_figures(image: np.array): cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - def filter_rects(): - for c in cnts: - area = cv2.contourArea(c) - if area > 800 and area < 15000: - yield cv2.boundingRect(c) - - for rect in filter_rects(): + for rect in filter_likely_primary_text_segments(cnts): x, y, w, h = rect cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) + return image + + +def __detect_large_coherent_structures(image: np.array): + """Detects large coherent structures on an image. + + References: + https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection + """ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] @@ -55,8 +76,18 @@ def detect_figures(image: np.array): cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + return cnts + + +def detect_figures(image: np.array): + + image = image.copy() + + image = remove_primary_text_regions(image) + cnts = __detect_large_coherent_structures(image) + cnts = filter(is_likely_figure, cnts) - rects = [cv2.boundingRect(c) for c in cnts] + rects = map(cv2.boundingRect, cnts) rects = remove_included(rects) return rects From c9780a57e5a048529d36958ba678eddb11759cef Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:24:41 +0100 Subject: [PATCH 02/27] removed obsolete import --- vidocp/figure_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index af68835..46e5484 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -2,7 +2,7 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import draw_contours, show_mpl, draw_rectangles, remove_included, remove_overlapping, show_cv2 +from vidocp.utils import show_mpl, draw_rectangles, remove_included def is_large_enough(cont, min_area=10000): From 504cafbd5d4bba183d9943b36c60548aae34e402 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:25:44 +0100 Subject: [PATCH 03/27] renaming --- scripts/annotate.py | 4 ++-- vidocp/figure_detection.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/annotate.py b/scripts/annotate.py index 10d40cc..682b8ad 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -3,7 +3,7 @@ import argparse from vidocp.table_parsing import annotate_tables_in_pdf from vidocp.redaction_detection import annotate_boxes_in_pdf from vidocp.layout_parsing import annotate_layout_in_pdf -from vidocp.figure_detection import remove_text_in_pdf +from vidocp.figure_detection import detect_figures_in_pdf def parse_args(): @@ -26,4 +26,4 @@ if __name__ == "__main__": elif args.type == "layout": annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "figure": - remove_text_in_pdf(args.pdf_path, page_index=args.page_index) + detect_figures_in_pdf(args.pdf_path, page_index=args.page_index) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 46e5484..f063f1a 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -93,7 +93,7 @@ def detect_figures(image: np.array): return rects -def remove_text_in_pdf(pdf_path, page_index=1): +def detect_figures_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) From fed3a7e4f1b8b7ca4e14f9e495459c26490fb50b Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:26:16 +0100 Subject: [PATCH 04/27] refactoring --- vidocp/figure_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index f063f1a..b5fd38b 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -34,7 +34,7 @@ def remove_primary_text_regions(image): def filter_likely_primary_text_segments(cnts): for c in cnts: area = cv2.contourArea(c) - if area > 800 and area < 15000: + if 800 < area < 15000: yield cv2.boundingRect(c) image = image.copy() From 98d77cb522a08821c3a13ae2cffbe7239c654762 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:27:55 +0100 Subject: [PATCH 05/27] refactoring --- vidocp/figure_detection.py | 64 ++------------------------------------ vidocp/utils.py | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index b5fd38b..830fc97 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -2,7 +2,8 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import show_mpl, draw_rectangles, remove_included +from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \ + __detect_large_coherent_structures def is_large_enough(cont, min_area=10000): @@ -18,67 +19,6 @@ def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): return is_large_enough(cont, min_area) and has_acceptable_format(cont, max_width_to_hight_ratio) -def remove_primary_text_regions(image): - """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. - - Args: - image: Image to remove primary text from. - - Returns: - Image with primary text removed. - - References: - https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background - """ - - def filter_likely_primary_text_segments(cnts): - for c in cnts: - area = cv2.contourArea(c) - if 800 < area < 15000: - yield cv2.boundingRect(c) - - image = image.copy() - - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] - - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) - close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) - - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) - dilate = cv2.dilate(close, dilate_kernel, iterations=1) - - cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - - for rect in filter_likely_primary_text_segments(cnts): - x, y, w, h = rect - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) - - return image - - -def __detect_large_coherent_structures(image: np.array): - """Detects large coherent structures on an image. - - References: - https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection - """ - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] - - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) - dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) - - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) - close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) - - cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - return cnts - - def detect_figures(image: np.array): image = image.copy() diff --git a/vidocp/utils.py b/vidocp/utils.py index ee528b4..8b9235f 100644 --- a/vidocp/utils.py +++ b/vidocp/utils.py @@ -2,6 +2,7 @@ from collections import namedtuple from functools import partial import cv2 +import numpy as np from matplotlib import pyplot as plt @@ -141,3 +142,64 @@ def vec_rect_to_xywh(rect): w = x2 - x h = y2 - y return x, y, w, h + + +def remove_primary_text_regions(image): + """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + + References: + https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background + """ + + def filter_likely_primary_text_segments(cnts): + for c in cnts: + area = cv2.contourArea(c) + if 800 < area < 15000: + yield cv2.boundingRect(c) + + image = image.copy() + + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) + close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) + dilate = cv2.dilate(close, dilate_kernel, iterations=1) + + cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + for rect in filter_likely_primary_text_segments(cnts): + x, y, w, h = rect + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) + + return image + + +def __detect_large_coherent_structures(image: np.array): + """Detects large coherent structures on an image. + + References: + https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection + """ + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) + dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) + close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + return cnts \ No newline at end of file From aa66b6865b00b0490b9e7695a6bae386e6f96723 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:31:21 +0100 Subject: [PATCH 06/27] refactoring --- vidocp/figure_detection.py | 11 +---------- vidocp/redaction_detection.py | 17 +---------------- vidocp/table_parsing.py | 4 ++-- vidocp/utils.py | 22 +++++++++++++++++++++- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 830fc97..5cb44ca 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -3,16 +3,7 @@ import numpy as np from pdf2image import pdf2image from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \ - __detect_large_coherent_structures - - -def is_large_enough(cont, min_area=10000): - return cv2.contourArea(cont, False) > min_area - - -def has_acceptable_format(cont, max_width_to_hight_ratio=6): - _, _, w, h = cv2.boundingRect(cont) - return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio) + __detect_large_coherent_structures, is_large_enough, has_acceptable_format def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index f1b319a..31cb3b1 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -5,22 +5,7 @@ import numpy as np import pdf2image from iteration_utilities import starfilter, first -from vidocp.utils import show_mpl, draw_contours - - -def is_filled(hierarchy): - # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv - return hierarchy[3] <= 0 and hierarchy[2] == -1 - - -def is_boxy(contour): - epsilon = 0.01 * cv2.arcLength(contour, True) - approx = cv2.approxPolyDP(contour, epsilon, True) - return len(approx) <= 10 - - -def is_large_enough(contour, min_area): - return cv2.contourArea(contour, False) > min_area +from vidocp.utils import show_mpl, draw_contours, is_large_enough, is_filled, is_boxy def is_likely_redaction(contour, hierarchy, min_area): diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 765fb1c..035f569 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -2,7 +2,7 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import show_cv2, draw_stats +from vidocp.utils import draw_stats, show_mpl def add_external_contours(image, img): @@ -52,4 +52,4 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): stats = parse_table(page) page = draw_stats(page, stats) - show_cv2(page) + show_mpl(page) diff --git a/vidocp/utils.py b/vidocp/utils.py index 8b9235f..752cd3f 100644 --- a/vidocp/utils.py +++ b/vidocp/utils.py @@ -202,4 +202,24 @@ def __detect_large_coherent_structures(image: np.array): cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - return cnts \ No newline at end of file + return cnts + + +def is_large_enough(cont, min_area): + return cv2.contourArea(cont, False) > min_area + + +def has_acceptable_format(cont, max_width_to_hight_ratio): + _, _, w, h = cv2.boundingRect(cont) + return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio) + + +def is_filled(hierarchy): + # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + return hierarchy[3] <= 0 and hierarchy[2] == -1 + + +def is_boxy(contour): + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + return len(approx) <= 10 \ No newline at end of file From 89a99d3586db4fbafa743a45bdd02eaf0c1f341f Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:39:49 +0100 Subject: [PATCH 07/27] refactoring --- vidocp/utils.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vidocp/utils.py b/vidocp/utils.py index 752cd3f..a7a50a4 100644 --- a/vidocp/utils.py +++ b/vidocp/utils.py @@ -157,11 +157,8 @@ def remove_primary_text_regions(image): https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background """ - def filter_likely_primary_text_segments(cnts): - for c in cnts: - area = cv2.contourArea(c) - if 800 < area < 15000: - yield cv2.boundingRect(c) + def is_likely_primary_text_segments(cnt): + return 800 < cv2.contourArea(cnt) < 15000 image = image.copy() @@ -177,8 +174,8 @@ def remove_primary_text_regions(image): cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - for rect in filter_likely_primary_text_segments(cnts): - x, y, w, h = rect + for cnt in filter(is_likely_primary_text_segments, cnts): + x, y, w, h = cv2.boundingRect(cnt) cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) return image From e8863d67aaaff138fb088c4e496a91b6354cc059 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:42:45 +0100 Subject: [PATCH 08/27] refactoring --- vidocp/figure_detection.py | 11 +++++++++-- vidocp/utils.py | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 5cb44ca..2bf77db 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -2,8 +2,15 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \ - __detect_large_coherent_structures, is_large_enough, has_acceptable_format +from vidocp.utils import ( + show_mpl, + draw_rectangles, + remove_included, + remove_primary_text_regions, + __detect_large_coherent_structures, + is_large_enough, + has_acceptable_format, +) def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): diff --git a/vidocp/utils.py b/vidocp/utils.py index a7a50a4..ff61135 100644 --- a/vidocp/utils.py +++ b/vidocp/utils.py @@ -144,8 +144,8 @@ def vec_rect_to_xywh(rect): return x, y, w, h -def remove_primary_text_regions(image): - """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. +def find_primary_text_regions(image): + """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. Args: image: Image to remove primary text from. @@ -174,7 +174,26 @@ def remove_primary_text_regions(image): cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - for cnt in filter(is_likely_primary_text_segments, cnts): + cnts = filter(is_likely_primary_text_segments, cnts) + + return cnts + + +def remove_primary_text_regions(image): + """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + """ + + image = image.copy() + + cnts = find_primary_text_regions(image) + + for cnt in cnts: x, y, w, h = cv2.boundingRect(cnt) cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) @@ -206,17 +225,21 @@ def is_large_enough(cont, min_area): return cv2.contourArea(cont, False) > min_area -def has_acceptable_format(cont, max_width_to_hight_ratio): +def has_acceptable_format(cont, max_width_to_height_ratio): _, _, w, h = cv2.boundingRect(cont) - return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio) + return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio) def is_filled(hierarchy): - # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + """Checks whether a hierarchy is filled. + + References: + https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + """ return hierarchy[3] <= 0 and hierarchy[2] == -1 def is_boxy(contour): epsilon = 0.01 * cv2.arcLength(contour, True) approx = cv2.approxPolyDP(contour, epsilon, True) - return len(approx) <= 10 \ No newline at end of file + return len(approx) <= 10 From 9d30009dceec0357db6499bfaffae8ce97718ee0 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:45:53 +0100 Subject: [PATCH 09/27] refactoring --- vidocp/figure_detection.py | 4 ++-- vidocp/utils/__init__.py | 1 + vidocp/{ => utils}/utils.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 vidocp/utils/__init__.py rename vidocp/{ => utils}/utils.py (99%) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 2bf77db..2c58968 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -7,7 +7,7 @@ from vidocp.utils import ( draw_rectangles, remove_included, remove_primary_text_regions, - __detect_large_coherent_structures, + detect_large_coherent_structures, is_large_enough, has_acceptable_format, ) @@ -22,7 +22,7 @@ def detect_figures(image: np.array): image = image.copy() image = remove_primary_text_regions(image) - cnts = __detect_large_coherent_structures(image) + cnts = detect_large_coherent_structures(image) cnts = filter(is_likely_figure, cnts) rects = map(cv2.boundingRect, cnts) diff --git a/vidocp/utils/__init__.py b/vidocp/utils/__init__.py new file mode 100644 index 0000000..90f60fd --- /dev/null +++ b/vidocp/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/vidocp/utils.py b/vidocp/utils/utils.py similarity index 99% rename from vidocp/utils.py rename to vidocp/utils/utils.py index ff61135..1802fca 100644 --- a/vidocp/utils.py +++ b/vidocp/utils/utils.py @@ -200,7 +200,7 @@ def remove_primary_text_regions(image): return image -def __detect_large_coherent_structures(image: np.array): +def detect_large_coherent_structures(image: np.array): """Detects large coherent structures on an image. References: From d9567da428c81f9cd7971a657281df0a90166810 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:47:18 +0100 Subject: [PATCH 10/27] refactoring --- vidocp/figure_detection.py | 2 +- vidocp/utils/text.py | 57 ++++++++++++++++++++++++++++++++++++++ vidocp/utils/utils.py | 56 ------------------------------------- 3 files changed, 58 insertions(+), 57 deletions(-) create mode 100644 vidocp/utils/text.py diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 2c58968..42ded9b 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -6,11 +6,11 @@ from vidocp.utils import ( show_mpl, draw_rectangles, remove_included, - remove_primary_text_regions, detect_large_coherent_structures, is_large_enough, has_acceptable_format, ) +from vidocp.utils.text import remove_primary_text_regions def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py new file mode 100644 index 0000000..3a7d2b1 --- /dev/null +++ b/vidocp/utils/text.py @@ -0,0 +1,57 @@ +import cv2 + + +def remove_primary_text_regions(image): + """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + """ + + image = image.copy() + + cnts = find_primary_text_regions(image) + + for cnt in cnts: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) + + return image + + +def find_primary_text_regions(image): + """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + + References: + https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background + """ + + def is_likely_primary_text_segments(cnt): + return 800 < cv2.contourArea(cnt) < 15000 + + image = image.copy() + + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) + close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) + dilate = cv2.dilate(close, dilate_kernel, iterations=1) + + cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + cnts = filter(is_likely_primary_text_segments, cnts) + + return cnts \ No newline at end of file diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py index 1802fca..2121804 100644 --- a/vidocp/utils/utils.py +++ b/vidocp/utils/utils.py @@ -144,62 +144,6 @@ def vec_rect_to_xywh(rect): return x, y, w, h -def find_primary_text_regions(image): - """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. - - Args: - image: Image to remove primary text from. - - Returns: - Image with primary text removed. - - References: - https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background - """ - - def is_likely_primary_text_segments(cnt): - return 800 < cv2.contourArea(cnt) < 15000 - - image = image.copy() - - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] - - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) - close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) - - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) - dilate = cv2.dilate(close, dilate_kernel, iterations=1) - - cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - - cnts = filter(is_likely_primary_text_segments, cnts) - - return cnts - - -def remove_primary_text_regions(image): - """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. - - Args: - image: Image to remove primary text from. - - Returns: - Image with primary text removed. - """ - - image = image.copy() - - cnts = find_primary_text_regions(image) - - for cnt in cnts: - x, y, w, h = cv2.boundingRect(cnt) - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) - - return image - - def detect_large_coherent_structures(image: np.array): """Detects large coherent structures on an image. From e652da1fa88a048f9a5211b4e8c0b96074fb5849 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:53:17 +0100 Subject: [PATCH 11/27] refactoring --- vidocp/figure_detection.py | 13 +-- vidocp/layout_parsing.py | 4 +- vidocp/redaction_detection.py | 4 +- vidocp/table_parsing.py | 3 +- vidocp/utils/__init__.py | 2 +- vidocp/utils/detection.py | 23 +++++ vidocp/utils/display.py | 16 +++ vidocp/utils/draw.py | 56 ++++++++++ vidocp/utils/filters.py | 25 +++++ vidocp/utils/post_processing.py | 62 +++++++++++ vidocp/utils/text.py | 2 +- vidocp/utils/utils.py | 177 -------------------------------- 12 files changed, 197 insertions(+), 190 deletions(-) create mode 100644 vidocp/utils/detection.py create mode 100644 vidocp/utils/display.py create mode 100644 vidocp/utils/draw.py create mode 100644 vidocp/utils/filters.py create mode 100644 vidocp/utils/post_processing.py diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index 42ded9b..27a8eb2 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -2,14 +2,11 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import ( - show_mpl, - draw_rectangles, - remove_included, - detect_large_coherent_structures, - is_large_enough, - has_acceptable_format, -) +from vidocp.utils.detection import detect_large_coherent_structures +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import remove_included +from vidocp.utils.filters import is_large_enough, has_acceptable_format from vidocp.utils.text import remove_primary_text_regions diff --git a/vidocp/layout_parsing.py b/vidocp/layout_parsing.py index 67cd89e..b5f1c51 100644 --- a/vidocp/layout_parsing.py +++ b/vidocp/layout_parsing.py @@ -6,7 +6,9 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import draw_rectangles, show_mpl, remove_overlapping, remove_included, has_no_parent +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import remove_overlapping, remove_included, has_no_parent def is_likely_segment(rect, min_area=100): diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index 31cb3b1..1843f60 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -5,7 +5,9 @@ import numpy as np import pdf2image from iteration_utilities import starfilter, first -from vidocp.utils import show_mpl, draw_contours, is_large_enough, is_filled, is_boxy +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_contours +from vidocp.utils.filters import is_large_enough, is_filled, is_boxy def is_likely_redaction(contour, hierarchy, min_area): diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 035f569..c991d43 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -2,7 +2,8 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import draw_stats, show_mpl +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_stats def add_external_contours(image, img): diff --git a/vidocp/utils/__init__.py b/vidocp/utils/__init__.py index 90f60fd..16281fe 100644 --- a/vidocp/utils/__init__.py +++ b/vidocp/utils/__init__.py @@ -1 +1 @@ -from .utils import * \ No newline at end of file +from .utils import * diff --git a/vidocp/utils/detection.py b/vidocp/utils/detection.py new file mode 100644 index 0000000..e5d8266 --- /dev/null +++ b/vidocp/utils/detection.py @@ -0,0 +1,23 @@ +import cv2 +import numpy as np + + +def detect_large_coherent_structures(image: np.array): + """Detects large coherent structures on an image. + + References: + https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection + """ + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) + dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) + close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + return cnts diff --git a/vidocp/utils/display.py b/vidocp/utils/display.py new file mode 100644 index 0000000..e0cb8ab --- /dev/null +++ b/vidocp/utils/display.py @@ -0,0 +1,16 @@ +import cv2 +from matplotlib import pyplot as plt + + +def show_mpl(image): + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(image) + plt.show() + + +def show_cv2(image): + + cv2.imshow("", image) + cv2.waitKey(0) diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py new file mode 100644 index 0000000..32c66f6 --- /dev/null +++ b/vidocp/utils/draw.py @@ -0,0 +1,56 @@ +import cv2 + +from vidocp.utils import copy_and_normalize_channels + + +def draw_contours(image, contours): + + image = copy_and_normalize_channels(image) + + for cont in contours: + cv2.drawContours(image, cont, -1, (0, 255, 0), 4) + + return image + + +def draw_rectangles(image, rectangles, color=None): + + image = copy_and_normalize_channels(image) + + if not color: + color = (0, 255, 0) + + for rect in rectangles: + x, y, w, h = rect + cv2.rectangle(image, (x, y), (x + w, y + h), color, 2) + + return image + + +def draw_stats(image, stats, annotate=False): + + image = copy_and_normalize_channels(image) + + keys = ["x", "y", "w", "h"] + + def annotate_stat(x, y, w, h): + + for i, (s, v) in enumerate(zip(keys, [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) + + def draw_stat(stat): + + x, y, w, h, area = stat + + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) + + if annotate: + annotate_stat(x, y, w, h) + + for stat in stats[2:]: + draw_stat(stat) + + return image diff --git a/vidocp/utils/filters.py b/vidocp/utils/filters.py new file mode 100644 index 0000000..274925c --- /dev/null +++ b/vidocp/utils/filters.py @@ -0,0 +1,25 @@ +import cv2 + + +def is_large_enough(cont, min_area): + return cv2.contourArea(cont, False) > min_area + + +def has_acceptable_format(cont, max_width_to_height_ratio): + _, _, w, h = cv2.boundingRect(cont) + return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio) + + +def is_filled(hierarchy): + """Checks whether a hierarchy is filled. + + References: + https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + """ + return hierarchy[3] <= 0 and hierarchy[2] == -1 + + +def is_boxy(contour): + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + return len(approx) <= 10 diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py new file mode 100644 index 0000000..0cd7f62 --- /dev/null +++ b/vidocp/utils/post_processing.py @@ -0,0 +1,62 @@ +from collections import namedtuple +from functools import partial + + +def remove_overlapping(rectangles): + def overlap(a, b): + return compute_intersection(a, b) > 0 + + def does_not_overlap(rect, rectangles): + return not any(overlap(rect, r2) for r2 in rectangles if not rect == r2) + + rectangles = list(map(xywh_to_vec_rect, rectangles)) + rectangles = filter(partial(does_not_overlap, rectangles=rectangles), rectangles) + rectangles = map(vec_rect_to_xywh, rectangles) + return rectangles + + +def remove_included(rectangles): + def included(a, b): + return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax + + def is_not_included(rect, rectangles): + return not any(included(r2, rect) for r2 in rectangles if not rect == r2) + + rectangles = list(map(xywh_to_vec_rect, rectangles)) + rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles) + rectangles = map(vec_rect_to_xywh, rectangles) + return rectangles + + +Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") + + +def make_box(x1, y1, x2, y2): + keys = "x1", "y1", "x2", "y2" + return dict(zip(keys, [x1, y1, x2, y2])) + + +def compute_intersection(a, b): + + dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin) + dy = min(a.ymax, b.ymax) - max(a.ymin, b.ymin) + + return dx * dy if (dx >= 0) and (dy >= 0) else 0 + + +def has_no_parent(hierarchy): + return hierarchy[-1] <= 0 + + +def xywh_to_vec_rect(rect): + x1, y1, w, h = rect + x2 = x1 + w + y2 = y1 + h + return Rectangle(x1, y1, x2, y2) + + +def vec_rect_to_xywh(rect): + x, y, x2, y2 = rect + w = x2 - x + h = y2 - y + return x, y, w, h diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py index 3a7d2b1..4189005 100644 --- a/vidocp/utils/text.py +++ b/vidocp/utils/text.py @@ -54,4 +54,4 @@ def find_primary_text_regions(image): cnts = filter(is_likely_primary_text_segments, cnts) - return cnts \ No newline at end of file + return cnts diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py index 2121804..18c8eb2 100644 --- a/vidocp/utils/utils.py +++ b/vidocp/utils/utils.py @@ -1,23 +1,4 @@ -from collections import namedtuple -from functools import partial - import cv2 -import numpy as np -from matplotlib import pyplot as plt - - -def show_mpl(image): - - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(image) - plt.show() - - -def show_cv2(image): - - cv2.imshow("", image) - cv2.waitKey(0) def copy_and_normalize_channels(image): @@ -29,161 +10,3 @@ def copy_and_normalize_channels(image): pass return image - - -def draw_contours(image, contours): - - image = copy_and_normalize_channels(image) - - for cont in contours: - cv2.drawContours(image, cont, -1, (0, 255, 0), 4) - - return image - - -def draw_rectangles(image, rectangles, color=None): - - image = copy_and_normalize_channels(image) - - if not color: - color = (0, 255, 0) - - for rect in rectangles: - x, y, w, h = rect - cv2.rectangle(image, (x, y), (x + w, y + h), color, 2) - - return image - - -def draw_stats(image, stats, annotate=False): - - image = copy_and_normalize_channels(image) - - keys = ["x", "y", "w", "h"] - - def annotate_stat(x, y, w, h): - - for i, (s, v) in enumerate(zip(keys, [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) - - def draw_stat(stat): - - x, y, w, h, area = stat - - cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) - - if annotate: - annotate_stat(x, y, w, h) - - for stat in stats[2:]: - draw_stat(stat) - - return image - - -def remove_overlapping(rectangles): - def overlap(a, b): - return compute_intersection(a, b) > 0 - - def does_not_overlap(rect, rectangles): - return not any(overlap(rect, r2) for r2 in rectangles if not rect == r2) - - rectangles = list(map(xywh_to_vec_rect, rectangles)) - rectangles = filter(partial(does_not_overlap, rectangles=rectangles), rectangles) - rectangles = map(vec_rect_to_xywh, rectangles) - return rectangles - - -def remove_included(rectangles): - def included(a, b): - return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax - - def is_not_included(rect, rectangles): - return not any(included(r2, rect) for r2 in rectangles if not rect == r2) - - rectangles = list(map(xywh_to_vec_rect, rectangles)) - rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles) - rectangles = map(vec_rect_to_xywh, rectangles) - return rectangles - - -Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") - - -def make_box(x1, y1, x2, y2): - keys = "x1", "y1", "x2", "y2" - return dict(zip(keys, [x1, y1, x2, y2])) - - -def compute_intersection(a, b): - - dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin) - dy = min(a.ymax, b.ymax) - max(a.ymin, b.ymin) - - return dx * dy if (dx >= 0) and (dy >= 0) else 0 - - -def has_no_parent(hierarchy): - return hierarchy[-1] <= 0 - - -def xywh_to_vec_rect(rect): - x1, y1, w, h = rect - x2 = x1 + w - y2 = y1 + h - return Rectangle(x1, y1, x2, y2) - - -def vec_rect_to_xywh(rect): - x, y, x2, y2 = rect - w = x2 - x - h = y2 - y - return x, y, w, h - - -def detect_large_coherent_structures(image: np.array): - """Detects large coherent structures on an image. - - References: - https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection - """ - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] - - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) - dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) - - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) - close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) - - cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - return cnts - - -def is_large_enough(cont, min_area): - return cv2.contourArea(cont, False) > min_area - - -def has_acceptable_format(cont, max_width_to_height_ratio): - _, _, w, h = cv2.boundingRect(cont) - return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio) - - -def is_filled(hierarchy): - """Checks whether a hierarchy is filled. - - References: - https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv - """ - return hierarchy[3] <= 0 and hierarchy[2] == -1 - - -def is_boxy(contour): - epsilon = 0.01 * cv2.arcLength(contour, True) - approx = cv2.approxPolyDP(contour, epsilon, True) - return len(approx) <= 10 From 36a62a13e51148d2420cb12930e84d78629db6b0 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:54:53 +0100 Subject: [PATCH 12/27] refactoring --- scripts/annotate.py | 4 ++-- vidocp/redaction_detection.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/annotate.py b/scripts/annotate.py index 682b8ad..9ef1bce 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -1,7 +1,7 @@ import argparse from vidocp.table_parsing import annotate_tables_in_pdf -from vidocp.redaction_detection import annotate_boxes_in_pdf +from vidocp.redaction_detection import annotate_redactions_in_pdf from vidocp.layout_parsing import annotate_layout_in_pdf from vidocp.figure_detection import detect_figures_in_pdf @@ -22,7 +22,7 @@ if __name__ == "__main__": if args.type == "table": annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "redaction": - annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index) + annotate_redactions_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "layout": annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "figure": diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index 1843f60..3362dc6 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -30,7 +30,7 @@ def find_redactions(image: np.array, min_normalized_area=200000): return contours -def annotate_boxes_in_pdf(pdf_path, page_index=1): +def annotate_redactions_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) From 106b333dca49780368c96400956f3b8186754f52 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 16:44:07 +0100 Subject: [PATCH 13/27] filtering for connected cells... but does not quite work yet --- vidocp/table_parsing.py | 27 ++++++++++++++++++++++----- vidocp/utils/post_processing.py | 24 +++++++++++++++++++----- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index c991d43..5b811b8 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -1,14 +1,20 @@ +from functools import partial + import cv2 import numpy as np from pdf2image import pdf2image from vidocp.utils.display import show_mpl -from vidocp.utils.draw import draw_stats +from vidocp.utils.draw import draw_stats, draw_rectangles +from vidocp.utils.filters import is_large_enough +from vidocp.utils.post_processing import remove_isolated def add_external_contours(image, img): - contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + # contours = filter(partial(is_large_enough, min_area=5000000), contours) for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) @@ -40,9 +46,20 @@ def parse_table(image: np.array): img_bin = isolate_vertical_and_horizontal_components(img_bin) img_bin_final = add_external_contours(img_bin, img_bin) - _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) - return stats + def is_large_enough(stat): + x1, y1, w, h, area = stat + return area > 3000 + + stats = np.vstack(list(filter(is_large_enough, stats))) + + rects = stats[:, :-1][2:] + + # FIXME: For some reason some isolated rects remain. + rects = remove_isolated(rects) + + return rects def annotate_tables_in_pdf(pdf_path, page_index=1): @@ -51,6 +68,6 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): page = np.array(page) stats = parse_table(page) - page = draw_stats(page, stats) + page = draw_rectangles(page, stats) show_mpl(page) diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 0cd7f62..c05ab03 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -1,5 +1,6 @@ from collections import namedtuple from functools import partial +from itertools import starmap def remove_overlapping(rectangles): @@ -28,14 +29,27 @@ def remove_included(rectangles): return rectangles +# FIXME: For some reason some isolated rects remain. +def remove_isolated(rectangles): + def are_neighbours(a, b): + + def adjacent(n, m): + return abs(n - m) <= 1 + + return any(starmap(adjacent, [(b.xmin, a.xmax), (b.ymin, a.ymax), (b.xmax, a.xmin), (b.ymax, a.ymin)])) + + def is_connected(rect, rectangles): + return any(are_neighbours(r2, rect) for r2 in rectangles if not rect == r2) + + rectangles = list(map(xywh_to_vec_rect, rectangles)) + rectangles = filter(partial(is_connected, rectangles=rectangles), rectangles) + rectangles = map(vec_rect_to_xywh, rectangles) + return rectangles + + Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") -def make_box(x1, y1, x2, y2): - keys = "x1", "y1", "x2", "y2" - return dict(zip(keys, [x1, y1, x2, y2])) - - def compute_intersection(a, b): dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin) From 0fc6cf8008b9a5860789eeda38a7cae03764f290 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 20:00:38 +0100 Subject: [PATCH 14/27] fixed bug in adjaceny test --- vidocp/table_parsing.py | 2 +- vidocp/utils/post_processing.py | 69 ++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 5b811b8..c23aa5b 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -57,7 +57,7 @@ def parse_table(image: np.array): rects = stats[:, :-1][2:] # FIXME: For some reason some isolated rects remain. - rects = remove_isolated(rects) + rects = remove_isolated(rects, input_sorted=True) return rects diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index c05ab03..13f3149 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -1,6 +1,6 @@ from collections import namedtuple from functools import partial -from itertools import starmap +from itertools import starmap, compress def remove_overlapping(rectangles): @@ -29,24 +29,73 @@ def remove_included(rectangles): return rectangles +def adjacent1d(n, m, tolerance=1): + return abs(n - m) <= tolerance + + +def adjacent(a, b): + """Two rects (v1, v2), (w1, w2) are adjacent if either of: + - the x components of v2 and w1 match and the y components of w1 or w2 are in the range of the y components of v1 and v2 + - the x components of v1 and w2 match and the y components of w1 or w2 are in the range of the y components of v1 and v2 + - the y components of v2 and w1 match and the x components of w1 or w2 are in the range of the x components of v1 and v2 + - the y components of v1 and w2 match and the x components of w1 or w2 are in the range of the x components of v1 and v2 + """ + + def adjacent2d(g, h, i, j, k, l): + return adjacent1d(g, h) and any(k <= p <= l for p in [i, j]) + + if any(x is None for x in (a, b)): + return False + + v1 = a.xmin, a.ymin + v2 = a.xmax, a.ymax + + w1 = b.xmin, b.ymin + w2 = b.xmax, b.ymax + + return any( + ( + adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), + adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]), + adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]), + adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]), + ) + ) + + # FIXME: For some reason some isolated rects remain. -def remove_isolated(rectangles): - def are_neighbours(a, b): - - def adjacent(n, m): - return abs(n - m) <= 1 - - return any(starmap(adjacent, [(b.xmin, a.xmax), (b.ymin, a.ymax), (b.xmax, a.xmin), (b.ymax, a.ymin)])) - +def __remove_isolated_unsorted(rectangles): def is_connected(rect, rectangles): - return any(are_neighbours(r2, rect) for r2 in rectangles if not rect == r2) + return any(adjacent(r2, rect) for r2 in rectangles if not rect == r2) rectangles = list(map(xywh_to_vec_rect, rectangles)) rectangles = filter(partial(is_connected, rectangles=rectangles), rectangles) rectangles = map(vec_rect_to_xywh, rectangles) + return rectangles +def __remove_isolated_sorted(rectangles): + def is_connected(left, center, right): + # if center == Rectangle(xmin=337, ymin=154, xmax=512, ymax=187) or center == Rectangle(xmin=719, ymin=188, xmax=781, ymax=251): + return any(starmap(adjacent, [(left, center), (center, right)])) + + rectangles = list(map(xywh_to_vec_rect, rectangles)) + + lefts = [None, *rectangles[:-1]] + rights = [*rectangles[1:], None] + + mask = starmap(is_connected, zip(lefts, rectangles, rights)) + rectangles = compress(rectangles, mask) + rectangles = map(vec_rect_to_xywh, rectangles) + + return rectangles + + +def remove_isolated(rectangles, input_sorted=False): + return (__remove_isolated_sorted if input_sorted else __remove_isolated_unsorted)(rectangles) + + Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") From 36284f9a78a5aecbe893a2f3f66de7a2054a7a3d Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 20:01:00 +0100 Subject: [PATCH 15/27] removed obsolete lines --- vidocp/utils/post_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 13f3149..f390b03 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -77,7 +77,6 @@ def __remove_isolated_unsorted(rectangles): def __remove_isolated_sorted(rectangles): def is_connected(left, center, right): - # if center == Rectangle(xmin=337, ymin=154, xmax=512, ymax=187) or center == Rectangle(xmin=719, ymin=188, xmax=781, ymax=251): return any(starmap(adjacent, [(left, center), (center, right)])) rectangles = list(map(xywh_to_vec_rect, rectangles)) From 90b8613bf8677901e81ca5ee72ab0d80fae97c3f Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 21:03:40 +0100 Subject: [PATCH 16/27] filtering non-tables by bounding rect check WIP --- vidocp/table_parsing.py | 56 +++++++++++++++++++++++++++------ vidocp/utils/draw.py | 12 +++++-- vidocp/utils/post_processing.py | 13 +++++++- 3 files changed, 68 insertions(+), 13 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index c23aa5b..c4d8485 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -1,4 +1,6 @@ from functools import partial +from itertools import chain, starmap +from operator import attrgetter import cv2 import numpy as np @@ -7,7 +9,7 @@ from pdf2image import pdf2image from vidocp.utils.display import show_mpl from vidocp.utils.draw import draw_stats, draw_rectangles from vidocp.utils.filters import is_large_enough -from vidocp.utils.post_processing import remove_isolated +from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d def add_external_contours(image, img): @@ -37,8 +39,46 @@ def isolate_vertical_and_horizontal_components(img_bin): return img_bin_final +def has_table_shape(rects): + + assert isinstance(rects, list) + + points = list(chain(*map(xywh_to_vecs, rects))) + brect = xywh_to_vec_rect(cv2.boundingRect(np.vstack(points))) + + rects = list(map(xywh_to_vec_rect, rects)) + + # print(rects) + # print(brect) + + def matches_bounding_rect_corner(rect, x, y): + corresp_coords = list(zip(*map(attrgetter(x, y), [brect, rect]))) + ret = all(starmap(partial(adjacent1d, tolerance=30), corresp_coords)) + # print() + # print(x, y) + # print(brect) + # print(rect) + # print(corresp_coords) + # print(ret) + + return ret + + return all( + ( + any(matches_bounding_rect_corner(r, "xmin", "ymin") for r in rects), + any(matches_bounding_rect_corner(r, "xmin", "ymax") for r in rects), + any(matches_bounding_rect_corner(r, "xmax", "ymax") for r in rects), + any(matches_bounding_rect_corner(r, "xmax", "ymin") for r in rects), + ) + ) + + def parse_table(image: np.array): + def is_large_enough(stat): + x1, y1, w, h, area = stat + return area > 3000 + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) img_bin = ~img_bin @@ -48,16 +88,14 @@ def parse_table(image: np.array): _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) - def is_large_enough(stat): - x1, y1, w, h, area = stat - return area > 3000 - stats = np.vstack(list(filter(is_large_enough, stats))) - rects = stats[:, :-1][2:] + rects = list(remove_isolated(rects, input_sorted=True)) - # FIXME: For some reason some isolated rects remain. - rects = remove_isolated(rects, input_sorted=True) + # print(f"{has_table_shape(rects) = }") + # if not has_table_shape(rects): + # print(111111111111111111111) + # return [] return rects @@ -68,6 +106,6 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): page = np.array(page) stats = parse_table(page) - page = draw_rectangles(page, stats) + page = draw_rectangles(page, stats, annotate=True) show_mpl(page) diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py index 32c66f6..13e6a73 100644 --- a/vidocp/utils/draw.py +++ b/vidocp/utils/draw.py @@ -13,7 +13,10 @@ def draw_contours(image, contours): return image -def draw_rectangles(image, rectangles, color=None): +def draw_rectangles(image, rectangles, color=None, annotate=False): + + def annotate_rect(x, y, w, h): + cv2.putText(image, "+", (x + (w // 2) - 12, y + (h // 2) + 9), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) image = copy_and_normalize_channels(image) @@ -24,13 +27,14 @@ def draw_rectangles(image, rectangles, color=None): x, y, w, h = rect cv2.rectangle(image, (x, y), (x + w, y + h), color, 2) + if annotate: + annotate_rect(x, y, w, h) + return image def draw_stats(image, stats, annotate=False): - image = copy_and_normalize_channels(image) - keys = ["x", "y", "w", "h"] def annotate_stat(x, y, w, h): @@ -50,6 +54,8 @@ def draw_stats(image, stats, annotate=False): if annotate: annotate_stat(x, y, w, h) + image = copy_and_normalize_channels(image) + for stat in stats[2:]: draw_stat(stat) diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index f390b03..a3f1272 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -111,10 +111,21 @@ def has_no_parent(hierarchy): def xywh_to_vec_rect(rect): + v1, v2 = xywh_to_vecs(rect) + return Rectangle(*v1, *v2) + + +def vecs_to_vec_rect(rect): + print(rect) + v1, v2 = rect + return Rectangle(*v1, *v2) + + +def xywh_to_vecs(rect): x1, y1, w, h = rect x2 = x1 + w y2 = y1 + h - return Rectangle(x1, y1, x2, y2) + return (x1, y1), (x2, y2) def vec_rect_to_xywh(rect): From 295666c28f2a2b24783d0a27f0d71631a7c9a7d2 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 21:25:01 +0100 Subject: [PATCH 17/27] added todo comments --- vidocp/table_parsing.py | 5 ++++- vidocp/utils/post_processing.py | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index c4d8485..85989e7 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -6,7 +6,7 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils.display import show_mpl +from vidocp.utils.display import show_mpl, show_cv2 from vidocp.utils.draw import draw_stats, draw_rectangles from vidocp.utils.filters import is_large_enough from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d @@ -39,6 +39,7 @@ def isolate_vertical_and_horizontal_components(img_bin): return img_bin_final +# FIXME: does not work yet def has_table_shape(rects): assert isinstance(rects, list) @@ -83,6 +84,7 @@ def parse_table(image: np.array): th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) img_bin = ~img_bin + img_bin = isolate_vertical_and_horizontal_components(img_bin) img_bin_final = add_external_contours(img_bin, img_bin) @@ -90,6 +92,7 @@ def parse_table(image: np.array): stats = np.vstack(list(filter(is_large_enough, stats))) rects = stats[:, :-1][2:] + # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table` rects = list(remove_isolated(rects, input_sorted=True)) # print(f"{has_table_shape(rects) = }") diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index a3f1272..06dc1d9 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -17,11 +17,12 @@ def remove_overlapping(rectangles): def remove_included(rectangles): - def included(a, b): - return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax + def includes(a, b, tol=3): + """does a include b?""" + return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax def is_not_included(rect, rectangles): - return not any(included(r2, rect) for r2 in rectangles if not rect == r2) + return not any(includes(r2, rect) for r2 in rectangles if not rect == r2) rectangles = list(map(xywh_to_vec_rect, rectangles)) rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles) From 87cecadb440b9437aca1b968b5a4e92c6023c24f Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 21:27:39 +0100 Subject: [PATCH 18/27] applied black --- vidocp/table_parsing.py | 2 -- vidocp/utils/draw.py | 1 - 2 files changed, 3 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 85989e7..580b2f3 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -75,7 +75,6 @@ def has_table_shape(rects): def parse_table(image: np.array): - def is_large_enough(stat): x1, y1, w, h, area = stat return area > 3000 @@ -84,7 +83,6 @@ def parse_table(image: np.array): th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) img_bin = ~img_bin - img_bin = isolate_vertical_and_horizontal_components(img_bin) img_bin_final = add_external_contours(img_bin, img_bin) diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py index 13e6a73..7b23f0d 100644 --- a/vidocp/utils/draw.py +++ b/vidocp/utils/draw.py @@ -14,7 +14,6 @@ def draw_contours(image, contours): def draw_rectangles(image, rectangles, color=None, annotate=False): - def annotate_rect(x, y, w, h): cv2.putText(image, "+", (x + (w // 2) - 12, y + (h // 2) + 9), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) From f7d3e396921c04f7d463db7d7b858f7294313937 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Tue, 8 Feb 2022 15:05:12 +0100 Subject: [PATCH 19/27] nix dolles --- vidocp/layout_detection.py | 1 + vidocp/table_parsing.py | 7 +++---- vidocp/utils/draw.py | 29 ----------------------------- 3 files changed, 4 insertions(+), 33 deletions(-) diff --git a/vidocp/layout_detection.py b/vidocp/layout_detection.py index d559df0..1d49684 100644 --- a/vidocp/layout_detection.py +++ b/vidocp/layout_detection.py @@ -23,6 +23,7 @@ def find_layout_boxes(image: np.array): contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = imutils.grab_contours(contours) + for c in contours: peri = cv2.arcLength(c, True) approx = cv2.approxPolyDP(c, 0.04 * peri, True) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 580b2f3..2ead96c 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -6,10 +6,9 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils.display import show_mpl, show_cv2 -from vidocp.utils.draw import draw_stats, draw_rectangles -from vidocp.utils.filters import is_large_enough -from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated def add_external_contours(image, img): diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py index 7b23f0d..2f7ef06 100644 --- a/vidocp/utils/draw.py +++ b/vidocp/utils/draw.py @@ -30,32 +30,3 @@ def draw_rectangles(image, rectangles, color=None, annotate=False): annotate_rect(x, y, w, h) return image - - -def draw_stats(image, stats, annotate=False): - - keys = ["x", "y", "w", "h"] - - def annotate_stat(x, y, w, h): - - for i, (s, v) in enumerate(zip(keys, [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) - - def draw_stat(stat): - - x, y, w, h, area = stat - - cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) - - if annotate: - annotate_stat(x, y, w, h) - - image = copy_and_normalize_channels(image) - - for stat in stats[2:]: - draw_stat(stat) - - return image From 4964c8f5a154a24e967665d169fd72e4e9673538 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 10 Feb 2022 10:22:22 +0100 Subject: [PATCH 20/27] some changes to fix some minor bugs in table_parsing.py and post_processing.py --- vidocp/table_parsing.py | 10 ++++++++-- vidocp/utils/post_processing.py | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 2ead96c..0131c3c 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -76,10 +76,13 @@ def has_table_shape(rects): def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat - return area > 3000 + # was set too higg (3000): Boxes in a Table can definetly be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters + # with extra condition for the length of height and width weirdly narrow rectangles can be filtered + return area > 500 and w > 35 and h > 15 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) + #changed threshold value from 150 to 200 b + th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin img_bin = isolate_vertical_and_horizontal_components(img_bin) @@ -88,9 +91,12 @@ def parse_table(image: np.array): _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) stats = np.vstack(list(filter(is_large_enough, stats))) + print(stats) rects = stats[:, :-1][2:] + # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table` rects = list(remove_isolated(rects, input_sorted=True)) + print(rects) # print(f"{has_table_shape(rects) = }") # if not has_table_shape(rects): diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 06dc1d9..77f8cab 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -29,8 +29,8 @@ def remove_included(rectangles): rectangles = map(vec_rect_to_xywh, rectangles) return rectangles - -def adjacent1d(n, m, tolerance=1): +#tolerance was set too low (1) most lines are 2px wide +def adjacent1d(n, m, tolerance=2): return abs(n - m) <= tolerance @@ -43,6 +43,7 @@ def adjacent(a, b): """ def adjacent2d(g, h, i, j, k, l): + #print(abs(g-h), [k <= p <= l for p in [i, j]]) return adjacent1d(g, h) and any(k <= p <= l for p in [i, j]) if any(x is None for x in (a, b)): @@ -50,10 +51,17 @@ def adjacent(a, b): v1 = a.xmin, a.ymin v2 = a.xmax, a.ymax - + print("topleft and bottom right rec1", v1,v2) w1 = b.xmin, b.ymin w2 = b.xmax, b.ymax - + print("topleft and bottom right rec2", w1, w2) + # some rectangles are compared twice + print(( + adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), + adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]), + adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]), + adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]), + )) return any( ( adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), @@ -81,6 +89,7 @@ def __remove_isolated_sorted(rectangles): return any(starmap(adjacent, [(left, center), (center, right)])) rectangles = list(map(xywh_to_vec_rect, rectangles)) + # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles) lefts = [None, *rectangles[:-1]] rights = [*rectangles[1:], None] @@ -117,7 +126,6 @@ def xywh_to_vec_rect(rect): def vecs_to_vec_rect(rect): - print(rect) v1, v2 = rect return Rectangle(*v1, *v2) From 07907d45dd3c0a0fa6383ec4a938846ed8c375b1 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 10 Feb 2022 10:56:03 +0100 Subject: [PATCH 21/27] some changes to fix some minor bugs in table_parsing.py and post_processing.py --- vidocp/table_parsing.py | 8 +++----- vidocp/utils/post_processing.py | 17 ++++++++--------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 0131c3c..adaa210 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -76,12 +76,12 @@ def has_table_shape(rects): def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat - # was set too higg (3000): Boxes in a Table can definetly be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters - # with extra condition for the length of height and width weirdly narrow rectangles can be filtered + # was set too higg (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters + # with extra condition for the length of height and width, weirdly narrow rectangles can be filtered return area > 500 and w > 35 and h > 15 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - #changed threshold value from 150 to 200 b + #changed threshold value from 150 to 200 because of a shaded edgecase table th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin @@ -91,12 +91,10 @@ def parse_table(image: np.array): _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) stats = np.vstack(list(filter(is_large_enough, stats))) - print(stats) rects = stats[:, :-1][2:] # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table` rects = list(remove_isolated(rects, input_sorted=True)) - print(rects) # print(f"{has_table_shape(rects) = }") # if not has_table_shape(rects): diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 77f8cab..6cc9452 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -43,7 +43,6 @@ def adjacent(a, b): """ def adjacent2d(g, h, i, j, k, l): - #print(abs(g-h), [k <= p <= l for p in [i, j]]) return adjacent1d(g, h) and any(k <= p <= l for p in [i, j]) if any(x is None for x in (a, b)): @@ -51,17 +50,17 @@ def adjacent(a, b): v1 = a.xmin, a.ymin v2 = a.xmax, a.ymax - print("topleft and bottom right rec1", v1,v2) + #print("topleft and bottom right rec1", v1,v2) w1 = b.xmin, b.ymin w2 = b.xmax, b.ymax - print("topleft and bottom right rec2", w1, w2) + #print("topleft and bottom right rec2", w1, w2) # some rectangles are compared twice - print(( - adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), - adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]), - adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]), - adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]), - )) + # print(( + # adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), + # adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]), + # adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]), + # adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]), + # )) return any( ( adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), From 885fc22f9de9b4e44d7657b117b260f0a774f091 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 11 Feb 2022 15:59:54 +0100 Subject: [PATCH 22/27] added changes to parse scanned pdfs --- .gitignore | 1 + vidocp/table_parsing.py | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bac3af5 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/pdfs/ diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index adaa210..455e9f3 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -23,6 +23,19 @@ def add_external_contours(image, img): return image +def process_lines(img_bin_h, img_bin_v): + def draw_lines(lines, img_bin): + for line in lines: + for x1, y1, x2, y2 in line: + cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 3) + return img_bin + lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500, 700, 0) + draw_lines(lines_h, img_bin_h) + + lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 700, 0) + draw_lines(lines_v,img_bin_v) + + return img_bin_h, img_bin_v def isolate_vertical_and_horizontal_components(img_bin): @@ -33,6 +46,18 @@ def isolate_vertical_and_horizontal_components(img_bin): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + #img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v) + + # lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500) + # for line in lines_h: + # for x1, y1, x2, y2 in line: + # cv2.line(img_bin_h, (x1, y1), (x2, y2), (255, 255, 255), 3) + # lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 600, 0) + # for line in lines_v: + # for x1, y1, x2, y2 in line: + # cv2.line(img_bin_v, (x1, y1), (x2, y2), (255, 255, 255), 3) + + img_bin_final = img_bin_h | img_bin_v return img_bin_final @@ -73,15 +98,21 @@ def has_table_shape(rects): ) + + + def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat - # was set too higg (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters + # was set too high (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters # with extra condition for the length of height and width, weirdly narrow rectangles can be filtered return area > 500 and w > 35 and h > 15 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - #changed threshold value from 150 to 200 because of a shaded edgecase table + # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE) + # th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY) + + # changed threshold value from 150 to 195 because of a shaded edgecase table th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin @@ -96,10 +127,6 @@ def parse_table(image: np.array): # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table` rects = list(remove_isolated(rects, input_sorted=True)) - # print(f"{has_table_shape(rects) = }") - # if not has_table_shape(rects): - # print(111111111111111111111) - # return [] return rects From c2faf7d00bf4ed9d20d5d3422d172ae35d79643c Mon Sep 17 00:00:00 2001 From: llocarnini Date: Mon, 14 Feb 2022 11:04:04 +0100 Subject: [PATCH 23/27] adjusted isolation of vertical and horizontal components to be more robust to scanned pages; work in progress --- vidocp/table_parsing.py | 50 ++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 455e9f3..88a8790 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -10,6 +10,7 @@ from vidocp.utils.display import show_mpl from vidocp.utils.draw import draw_rectangles from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated +import matplotlib.pyplot as plt def add_external_contours(image, img): @@ -27,12 +28,12 @@ def process_lines(img_bin_h, img_bin_v): def draw_lines(lines, img_bin): for line in lines: for x1, y1, x2, y2 in line: - cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 3) + cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 6) return img_bin - lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500, 700, 0) + lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi / 180, 500, 500, 250) draw_lines(lines_h, img_bin_h) - lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 700, 0) + lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 500, 250) draw_lines(lines_v,img_bin_v) return img_bin_h, img_bin_v @@ -46,20 +47,17 @@ def isolate_vertical_and_horizontal_components(img_bin): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - #img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v) - - # lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500) - # for line in lines_h: - # for x1, y1, x2, y2 in line: - # cv2.line(img_bin_h, (x1, y1), (x2, y2), (255, 255, 255), 3) - # lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 600, 0) - # for line in lines_v: - # for x1, y1, x2, y2 in line: - # cv2.line(img_bin_v, (x1, y1), (x2, y2), (255, 255, 255), 3) - + img_bin_h = cv2.dilate(img_bin_h, kernel_h, 1) + img_bin_v = cv2.dilate(img_bin_v, kernel_v, 1) + img_bin_h = apply_motion_blur(img_bin_h, 100, 0) + img_bin_v = apply_motion_blur(img_bin_v, 100, 90) + # img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v) img_bin_final = img_bin_h | img_bin_v - + kernel = np.ones((5, 5), np.uint8) + # img_bin_final = cv2.dilate(img_bin_final, kernel, 2) + th1, img_bin_final = cv2.threshold(img_bin_final, 10, 255, cv2.THRESH_BINARY) + show_mpl(img_bin_final) return img_bin_final @@ -99,21 +97,27 @@ def has_table_shape(rects): +def apply_motion_blur(image, size, angle): + k = np.zeros((size, size), dtype=np.float32) + k[ (size-1)// 2 , :] = np.ones(size, dtype=np.float32) + k = cv2.warpAffine(k, cv2.getRotationMatrix2D( (size / 2 -0.5 , size / 2 -0.5 ) , angle, 1.0), (size, size) ) + k = k * ( 1.0 / np.sum(k) ) + return cv2.filter2D(image, -1, k) def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat # was set too high (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters - # with extra condition for the length of height and width, weirdly narrow rectangles can be filtered + # with extra condition for the length of height and width weirdly narrow rectangles can be filtered return area > 500 and w > 35 and h > 15 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE) - # th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY) - + blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE) + th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY) + show_mpl(img_bin) # changed threshold value from 150 to 195 because of a shaded edgecase table - th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) + # th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin img_bin = isolate_vertical_and_horizontal_components(img_bin) @@ -127,6 +131,10 @@ def parse_table(image: np.array): # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table` rects = list(remove_isolated(rects, input_sorted=True)) + # if not has_table_shape(rects): + # return False + + return rects @@ -138,5 +146,7 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): stats = parse_table(page) page = draw_rectangles(page, stats, annotate=True) + # if stats: + # page = draw_rectangles(page, stats, annotate=True) show_mpl(page) From 57ca47f38d68fe56baee483b1296d11e8a99e58b Mon Sep 17 00:00:00 2001 From: llocarnini Date: Wed, 16 Feb 2022 12:37:17 +0100 Subject: [PATCH 24/27] different approaches to isolate line components of tables in scanned pdf files. --- .gitignore | 1 + vidocp/table_parsing.py | 119 ++++++++++++++++++++++++++++------------ 2 files changed, 86 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index bac3af5..1cf261d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /pdfs/ +/results/ diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 88a8790..2301ac1 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -10,10 +10,8 @@ from vidocp.utils.display import show_mpl from vidocp.utils.draw import draw_rectangles from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated -import matplotlib.pyplot as plt def add_external_contours(image, img): - contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) # contours = filter(partial(is_large_enough, min_area=5000000), contours) @@ -24,46 +22,107 @@ def add_external_contours(image, img): return image -def process_lines(img_bin_h, img_bin_v): - def draw_lines(lines, img_bin): - for line in lines: + +def process_lines(img_line_component): + def draw_lines(detected_lines, img_bin): + for line in detected_lines: for x1, y1, x2, y2 in line: cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 6) return img_bin - lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi / 180, 500, 500, 250) - draw_lines(lines_h, img_bin_h) - lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 500, 250) - draw_lines(lines_v,img_bin_v) + lines = cv2.HoughLines(img_line_component, 1, np.pi / 180, 500) + draw_lines(lines, lines) + + return img_line_component + +# def isolate_vertical_and_horizontal_components(img_bin): +# line_min_width = 50 +# kernel_h = np.ones((1, line_min_width), np.uint8) +# kernel_v = np.ones((line_min_width, 1), np.uint8) +# +# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) +# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) +# show_mpl(img_bin_h | img_bin_v) +# +# img_bin_h = apply_motion_blur(img_bin_h, 140, 0) +# img_bin_v = apply_motion_blur(img_bin_v, 140, 90) +# show_mpl(img_bin_h | img_bin_v) +# +# th1, img_bin_h = cv2.threshold(img_bin_h, 95, 255, cv2.THRESH_BINARY) +# th1, img_bin_v = cv2.threshold(img_bin_v, 95, 255, cv2.THRESH_BINARY) +# show_mpl(img_bin_h | img_bin_v) +# +# kernel_h = np.ones((1, 8), np.uint8) +# kernel_v = np.ones((8, 1), np.uint8) +# img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=4) +# img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=4) +# +# img_bin_final = img_bin_h | img_bin_v +# show_mpl(img_bin_final) +# # th 130 +# #th1, img_bin_final = cv2.threshold(img_bin_final, 90, 255, cv2.THRESH_BINARY) +# #show_mpl(img_bin_final) +# return img_bin_final - return img_bin_h, img_bin_v def isolate_vertical_and_horizontal_components(img_bin): - line_min_width = 30 kernel_h = np.ones((1, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 1), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + show_mpl(img_bin_h | img_bin_v) - img_bin_h = cv2.dilate(img_bin_h, kernel_h, 1) - img_bin_v = cv2.dilate(img_bin_v, kernel_v, 1) + img_bin_h = apply_motion_blur(img_bin_h, 150, 0) + img_bin_v = apply_motion_blur(img_bin_v, 150, 90) + show_mpl(img_bin_h | img_bin_v) + + th1, img_bin_h = cv2.threshold(img_bin_h, 70, 255, cv2.THRESH_BINARY) + th1, img_bin_v = cv2.threshold(img_bin_v, 70, 255, cv2.THRESH_BINARY) + show_mpl(img_bin_h | img_bin_v) + + kernel_h = np.ones((1, 10), np.uint8) + kernel_v = np.ones((10, 1), np.uint8) + img_bin_h = cv2.erode(img_bin_h, kernel_h, iterations=1) + img_bin_v = cv2.erode(img_bin_v, kernel_v, iterations=1) - img_bin_h = apply_motion_blur(img_bin_h, 100, 0) - img_bin_v = apply_motion_blur(img_bin_v, 100, 90) - # img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v) img_bin_final = img_bin_h | img_bin_v - kernel = np.ones((5, 5), np.uint8) - # img_bin_final = cv2.dilate(img_bin_final, kernel, 2) - th1, img_bin_final = cv2.threshold(img_bin_final, 10, 255, cv2.THRESH_BINARY) show_mpl(img_bin_final) + # th 130 + # th1, img_bin_final = cv2.threshold(img_bin_final, 150, 255, cv2.THRESH_BINARY) + # show_mpl(img_bin_final) return img_bin_final +# def isolate_vertical_and_horizontal_components(img_bin): +# line_min_width = 30 +# kernel_h = np.ones((1, line_min_width), np.uint8) +# kernel_v = np.ones((line_min_width, 1), np.uint8) +# +# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) +# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) +# show_mpl(img_bin_h | img_bin_v) +# +# kernel_h = np.ones((1, 30), np.uint8) +# kernel_v = np.ones((30, 1), np.uint8) +# img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=1) +# img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=1) +# show_mpl(img_bin_h | img_bin_v) +# +# img_bin_h = apply_motion_blur(img_bin_h, 100, 0) +# img_bin_v = apply_motion_blur(img_bin_v, 100, 90) +# +# img_bin_final = img_bin_h | img_bin_v +# show_mpl(img_bin_final) +# # th 130 +# th1, img_bin_final = cv2.threshold(img_bin_final, 125, 255, cv2.THRESH_BINARY) +# show_mpl(img_bin_final) +# +# return img_bin_final + # FIXME: does not work yet def has_table_shape(rects): - assert isinstance(rects, list) points = list(chain(*map(xywh_to_vecs, rects))) @@ -96,29 +155,24 @@ def has_table_shape(rects): ) - def apply_motion_blur(image, size, angle): k = np.zeros((size, size), dtype=np.float32) - k[ (size-1)// 2 , :] = np.ones(size, dtype=np.float32) - k = cv2.warpAffine(k, cv2.getRotationMatrix2D( (size / 2 -0.5 , size / 2 -0.5 ) , angle, 1.0), (size, size) ) - k = k * ( 1.0 / np.sum(k) ) + k[(size - 1) // 2, :] = np.ones(size, dtype=np.float32) + k = cv2.warpAffine(k, cv2.getRotationMatrix2D((size / 2 - 0.5, size / 2 - 0.5), angle, 1.0), (size, size)) + k = k * (1.0 / np.sum(k)) return cv2.filter2D(image, -1, k) def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat - # was set too high (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters - # with extra condition for the length of height and width weirdly narrow rectangles can be filtered return area > 500 and w > 35 and h > 15 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE) - th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY) - show_mpl(img_bin) - # changed threshold value from 150 to 195 because of a shaded edgecase table - # th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) + # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE) + th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin + show_mpl(img_bin) img_bin = isolate_vertical_and_horizontal_components(img_bin) img_bin_final = add_external_contours(img_bin, img_bin) @@ -134,13 +188,10 @@ def parse_table(image: np.array): # if not has_table_shape(rects): # return False - - return rects def annotate_tables_in_pdf(pdf_path, page_index=1): - page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) From d70781f4aa8b3dd6286151d2cf85a85fe82e1caa Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 17 Feb 2022 16:45:55 +0100 Subject: [PATCH 25/27] changed tolerance in adjacent1 function in postprocessing.y from 2 to 4 added function so vertical and horizontal components do not overlap the layout box of the table --- vidocp/table_parsing.py | 120 ++++++++++---------------------- vidocp/utils/post_processing.py | 2 +- 2 files changed, 36 insertions(+), 86 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 2301ac1..dd65cd2 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -9,6 +9,7 @@ from pdf2image import pdf2image from vidocp.utils.display import show_mpl from vidocp.utils.draw import draw_rectangles from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated +from vidocp.layout_parsing import parse_layout def add_external_contours(image, img): @@ -23,50 +24,8 @@ def add_external_contours(image, img): return image -def process_lines(img_line_component): - def draw_lines(detected_lines, img_bin): - for line in detected_lines: - for x1, y1, x2, y2 in line: - cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 6) - return img_bin - - lines = cv2.HoughLines(img_line_component, 1, np.pi / 180, 500) - draw_lines(lines, lines) - - return img_line_component - -# def isolate_vertical_and_horizontal_components(img_bin): -# line_min_width = 50 -# kernel_h = np.ones((1, line_min_width), np.uint8) -# kernel_v = np.ones((line_min_width, 1), np.uint8) -# -# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) -# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) -# show_mpl(img_bin_h | img_bin_v) -# -# img_bin_h = apply_motion_blur(img_bin_h, 140, 0) -# img_bin_v = apply_motion_blur(img_bin_v, 140, 90) -# show_mpl(img_bin_h | img_bin_v) -# -# th1, img_bin_h = cv2.threshold(img_bin_h, 95, 255, cv2.THRESH_BINARY) -# th1, img_bin_v = cv2.threshold(img_bin_v, 95, 255, cv2.THRESH_BINARY) -# show_mpl(img_bin_h | img_bin_v) -# -# kernel_h = np.ones((1, 8), np.uint8) -# kernel_v = np.ones((8, 1), np.uint8) -# img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=4) -# img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=4) -# -# img_bin_final = img_bin_h | img_bin_v -# show_mpl(img_bin_final) -# # th 130 -# #th1, img_bin_final = cv2.threshold(img_bin_final, 90, 255, cv2.THRESH_BINARY) -# #show_mpl(img_bin_final) -# return img_bin_final - - -def isolate_vertical_and_horizontal_components(img_bin): - line_min_width = 30 +def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): + line_min_width = 47 kernel_h = np.ones((1, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 1), np.uint8) @@ -74,51 +33,32 @@ def isolate_vertical_and_horizontal_components(img_bin): img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) show_mpl(img_bin_h | img_bin_v) - img_bin_h = apply_motion_blur(img_bin_h, 150, 0) - img_bin_v = apply_motion_blur(img_bin_v, 150, 90) + kernel_h = np.ones((1, 30), np.uint8) + kernel_v = np.ones((30, 1), np.uint8) + img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2) + img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2) show_mpl(img_bin_h | img_bin_v) - th1, img_bin_h = cv2.threshold(img_bin_h, 70, 255, cv2.THRESH_BINARY) - th1, img_bin_v = cv2.threshold(img_bin_v, 70, 255, cv2.THRESH_BINARY) - show_mpl(img_bin_h | img_bin_v) - - kernel_h = np.ones((1, 10), np.uint8) - kernel_v = np.ones((10, 1), np.uint8) - img_bin_h = cv2.erode(img_bin_h, kernel_h, iterations=1) - img_bin_v = cv2.erode(img_bin_v, kernel_v, iterations=1) + img_bin_h = apply_motion_blur(img_bin_h, 100, 0) + img_bin_v = apply_motion_blur(img_bin_v, 100, 90) img_bin_final = img_bin_h | img_bin_v show_mpl(img_bin_final) - # th 130 - # th1, img_bin_final = cv2.threshold(img_bin_final, 150, 255, cv2.THRESH_BINARY) - # show_mpl(img_bin_final) + + th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY) + img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1) + show_mpl(img_bin_final) + img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects) + show_mpl(img_bin_final) + return img_bin_final -# def isolate_vertical_and_horizontal_components(img_bin): -# line_min_width = 30 -# kernel_h = np.ones((1, line_min_width), np.uint8) -# kernel_v = np.ones((line_min_width, 1), np.uint8) -# -# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) -# img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) -# show_mpl(img_bin_h | img_bin_v) -# -# kernel_h = np.ones((1, 30), np.uint8) -# kernel_v = np.ones((30, 1), np.uint8) -# img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=1) -# img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=1) -# show_mpl(img_bin_h | img_bin_v) -# -# img_bin_h = apply_motion_blur(img_bin_h, 100, 0) -# img_bin_v = apply_motion_blur(img_bin_v, 100, 90) -# -# img_bin_final = img_bin_h | img_bin_v -# show_mpl(img_bin_final) -# # th 130 -# th1, img_bin_final = cv2.threshold(img_bin_final, 125, 255, cv2.THRESH_BINARY) -# show_mpl(img_bin_final) -# -# return img_bin_final + +def disconnect_non_existing_cells(img_bin, bounding_rects): + for rect in bounding_rects: + x, y, w, h = rect + img_bin = cv2.rectangle(img_bin, (x, y), (x + w, y + h), (0, 0, 0), 5) + return img_bin # FIXME: does not work yet @@ -163,18 +103,28 @@ def apply_motion_blur(image, size, angle): return cv2.filter2D(image, -1, k) +def find_table_layout_boxes(image: np.array): + layout_boxes = parse_layout(image) + table_boxes = [] + for box in layout_boxes: + (x, y, w, h) = box + if w * h >= 300000: + table_boxes.append(box) + return table_boxes + + def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat - return area > 500 and w > 35 and h > 15 + return area > 2000 and w > 35 and h > 25 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE) th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin show_mpl(img_bin) - img_bin = isolate_vertical_and_horizontal_components(img_bin) + table_layout_boxes = find_table_layout_boxes(image) + img_bin = isolate_vertical_and_horizontal_components(img_bin, table_layout_boxes) img_bin_final = add_external_contours(img_bin, img_bin) _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 6cc9452..02018c3 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -30,7 +30,7 @@ def remove_included(rectangles): return rectangles #tolerance was set too low (1) most lines are 2px wide -def adjacent1d(n, m, tolerance=2): +def adjacent1d(n, m, tolerance=4): return abs(n - m) <= tolerance From 723c6606e1a3e2d192e9bfbb1ebc1e56c2cf8fe6 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 18 Feb 2022 16:35:50 +0100 Subject: [PATCH 26/27] kernel size for morphology ex set bit higher, so less non-table structure are detected. Reduced the kernel size of the directional motion blurr and increased the treshold a little bit so narrow cells wont be split up. Problem with the cell filtering for certain scanned pdfs detected. --- vidocp/table_parsing.py | 14 ++++++++------ vidocp/utils/post_processing.py | 14 ++------------ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index dd65cd2..c43a457 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -25,7 +25,7 @@ def add_external_contours(image, img): def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): - line_min_width = 47 + line_min_width = 48 kernel_h = np.ones((1, line_min_width), np.uint8) kernel_v = np.ones((line_min_width, 1), np.uint8) @@ -39,15 +39,17 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects): img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2) show_mpl(img_bin_h | img_bin_v) - img_bin_h = apply_motion_blur(img_bin_h, 100, 0) - img_bin_v = apply_motion_blur(img_bin_v, 100, 90) + #reduced filtersize from 100 to 80 to minimize splitting narrow cells + img_bin_h = apply_motion_blur(img_bin_h, 80, 0) + img_bin_v = apply_motion_blur(img_bin_v, 80, 90) img_bin_final = img_bin_h | img_bin_v show_mpl(img_bin_final) - - th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY) + #changed threshold from 110 to 120 to minimize cell splitting + th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY) img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1) show_mpl(img_bin_final) + # problem if layout parser detects too big of a layout box as in VV-748542.pdf p.22 img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects) show_mpl(img_bin_final) @@ -108,7 +110,7 @@ def find_table_layout_boxes(image: np.array): table_boxes = [] for box in layout_boxes: (x, y, w, h) = box - if w * h >= 300000: + if w * h >= 100000: table_boxes.append(box) return table_boxes diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 02018c3..79626d2 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -50,17 +50,10 @@ def adjacent(a, b): v1 = a.xmin, a.ymin v2 = a.xmax, a.ymax - #print("topleft and bottom right rec1", v1,v2) + w1 = b.xmin, b.ymin w2 = b.xmax, b.ymax - #print("topleft and bottom right rec2", w1, w2) - # some rectangles are compared twice - # print(( - # adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), - # adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]), - # adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]), - # adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]), - # )) + return any( ( adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), @@ -86,9 +79,7 @@ def __remove_isolated_unsorted(rectangles): def __remove_isolated_sorted(rectangles): def is_connected(left, center, right): return any(starmap(adjacent, [(left, center), (center, right)])) - rectangles = list(map(xywh_to_vec_rect, rectangles)) - # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles) lefts = [None, *rectangles[:-1]] rights = [*rectangles[1:], None] @@ -96,7 +87,6 @@ def __remove_isolated_sorted(rectangles): mask = starmap(is_connected, zip(lefts, rectangles, rights)) rectangles = compress(rectangles, mask) rectangles = map(vec_rect_to_xywh, rectangles) - return rectangles From 2a68e1b221881c1598864c719fe583d3b00a227c Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 18 Feb 2022 16:36:25 +0100 Subject: [PATCH 27/27] kernel size for morphology ex set bit higher, so less non-table structure are detected. Reduced the kernel size of the directional motion blurr and increased the treshold a little bit so narrow cells wont be split up. Problem with the cell filtering for certain scanned pdfs detected. --- vidocp/utils/post_processing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 79626d2..a3a04b1 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -43,6 +43,7 @@ def adjacent(a, b): """ def adjacent2d(g, h, i, j, k, l): + #print(adjacent1d(g, h) and any(k <= p <= l for p in [i, j])) return adjacent1d(g, h) and any(k <= p <= l for p in [i, j]) if any(x is None for x in (a, b)): @@ -78,6 +79,7 @@ def __remove_isolated_unsorted(rectangles): def __remove_isolated_sorted(rectangles): def is_connected(left, center, right): + # print(left,center,right, list(starmap(adjacent, [(left, center), (center, right)]))) return any(starmap(adjacent, [(left, center), (center, right)])) rectangles = list(map(xywh_to_vec_rect, rectangles))