Merge in RR/cv-analysis from diff-font-sizes-on-page to master
Squashed commit of the following:
commit d1b32a3e8fadd45d38040e1ba96672ace240ae29
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jun 30 14:43:30 2022 +0200
add tests for figure detection first iteration
commit c38a7701afaad513320f157fe7188b3f11a682ac
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jun 30 14:26:08 2022 +0200
update text tests with new test cases
commit ccc0c1a177c7d69c9575ec0267a492c3eef008e3
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Wed Jun 29 23:09:24 2022 +0200
added fixture for differently scaled text on a page and a parameter for different font styles
commit 5f36a634caad2849e673de7d64abb5b6c3a6055f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 17:03:52 2022 +0200
add pdf2pdf annotate script for figure detection
commit 7438c170371e166e82ab19f9dfdf1bddd89b7bb3
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 16:24:52 2022 +0200
optimize algorithm
commit 93bf8820f856d3815bab36b13c0df189c45d01e0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 16:11:15 2022 +0200
black
commit 59c639eec7d3f9da538b0ad6cd6215456c92eb58
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 16:10:39 2022 +0200
add tests for figure detection pipeline
commit bada688d88231843e9d299d255d9c4e0d5ca9788
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 13:34:36 2022 +0200
refactor tests
commit 614388a18b46d670527727c11f63e8174aed3736
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 13:34:14 2022 +0200
introduce pipeline logic for figure detection
commit 7195f892d543294829aebe80e260b4395b89cb36
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 11:58:41 2022 +0200
update reqs
commit 4408e7975853196c5e363dd2ddf62e15fe6f4944
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 11:56:16 2022 +0200
add figure detection test
commit 5ff472c2d96238ca2bc1d2368d3d02e62db98713
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 11:56:09 2022 +0200
add figure detection test
commit 66c1307e57c84789d64cb8e41d8e923ac98eebde
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 10:36:50 2022 +0200
refactor draw boxes to work as intended on an inverted image
commit 00a39050d051ae43b2a8f2c4efd6bfbd2609dead
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jun 28 10:36:11 2022 +0200
refactor module structure
commit f8af01894c387468334a332e75f7dbf545a91f86
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jun 27 17:07:47 2022 +0200
add: figure detection now agnostic to input image background color, refactor tests
commit 3bc63da783bced571d53b29b6d82648c9f93e886
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jun 27 14:31:15 2022 +0200
add text removal tests
commit 6e794a7cee3fd7633aa5084839775877b0f8794c
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jun 27 12:12:27 2022 +0200
figure detection tests WIP
commit f8b20d4c9845de6434142e3dab69ce467fbc7a75
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jun 24 15:39:37 2022 +0200
add tests for figure_detection WIP
commit f2a52a07a5e261962214dff40ba710c93993f6fb
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Fri Jun 24 14:28:44 2022 +0200
added third test case "figure_and_text"
commit 8f45c88278cdcd32a121ea8269c8eca816bffd0b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jun 24 13:25:17 2022 +0200
add tests for figure_detection
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
import cv2
|
|
|
|
|
|
def remove_primary_text_regions(image):
    """Black out the regions of a page image that look like primary text.

    "Primary text" means main text body paragraphs, as opposed to, for
    example, figure descriptions.

    The input is binarized before any processing, so the array handed back is
    the thresholded image with the detected regions filled in, not the
    original image.

    Args:
        image: Image to remove primary text from.

    Returns:
        The thresholded image in which the bounding box of every contour
        accepted by ``is_likely_primary_text_segment`` is filled with black.

    References:
        https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background
    """
    binarized = apply_threshold_to_image(image)
    # Boxes are painted onto a separate copy; contour detection below works on
    # images derived from ``binarized``.
    canvas = binarized.copy()

    # Both kernels are wider than they are tall -- presumably so that
    # neighbouring glyphs on one text line fuse into a single blob.
    # The trailing "20,3" / "5,3" notes are carried over from the original
    # code; they look like previously tried kernel sizes (unconfirmed).
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 7))  # 20,3
    dilation_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 3))  # 5,3

    closed = cv2.morphologyEx(binarized, cv2.MORPH_CLOSE, closing_kernel, iterations=1)
    merged = cv2.dilate(closed, dilation_kernel, iterations=1)

    # Only outermost contours are requested (RETR_EXTERNAL).
    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    # Lazily yield the bounding rectangle of each contour that passes the
    # primary-text heuristic; draw_bboxes consumes the generator.
    text_boxes = (
        cv2.boundingRect(contour)
        for contour in contours
        if is_likely_primary_text_segment(contour)
    )

    return draw_bboxes(canvas, text_boxes)
|
|
|
|
|
|
def apply_threshold_to_image(image):
    """Binarize an image using an inverted binary threshold with Otsu's method.

    Args:
        image: Input image. If its shape has more than two dimensions it is
            converted from BGR to grayscale first; otherwise it is used as is.

    Returns:
        The thresholded image (second element of ``cv2.threshold``'s result),
        produced with a maximum value of 255.
    """
    if len(image.shape) > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # NOTE(review): OpenCV documents that with THRESH_OTSU the threshold is
    # computed automatically and the value passed here (253) is not used --
    # confirm against the installed OpenCV version.
    mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 253, 255, mode)
    return binary
|
|
|
|
|
|
def is_likely_primary_text_segment(cnt):
    """Heuristically decide whether a contour is a primary-text segment.

    A contour is accepted when either of the following holds:

    * its contour area lies strictly between 400 and 16000, or
    * its bounding rectangle is more than three times as wide as it is tall.

    Args:
        cnt: Contour, as accepted by ``cv2.boundingRect`` and
            ``cv2.contourArea``.

    Returns:
        True if the contour matches the heuristic, False otherwise.
    """
    # Only the extent of the bounding box matters; its position is discarded.
    _, _, width, height = cv2.boundingRect(cnt)

    area = cv2.contourArea(cnt)
    if 400 < area < 16000:
        return True

    return width / height > 3
|
|
|
|
|
|
def draw_bboxes(image, bboxes):
    """Fill each bounding box on the image with black, in place.

    Args:
        image: Image to draw on; it is modified directly by
            ``cv2.rectangle``.
        bboxes: Iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        The same ``image`` object that was passed in.
    """
    for left, top, width, height in bboxes:
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        # Thickness -1 makes OpenCV draw a filled rectangle.
        cv2.rectangle(image, top_left, bottom_right, (0, 0, 0), -1)
    return image
|