Pull request #35: RED-5527 update pdf2img with skip_pages_without_images flag, add ENV for this setting for table parsing

Merge in RR/cv-analysis from RED-5527 to master

Squashed commit of the following:

commit 1748095ce45f4a76e9d3d8547d9fc70f7deae2fb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Nov 8 15:15:16 2022 +0100

    RED-5527 update pdf2img with skip_pages_without_images flag, add ENV for this setting for table parsing
This commit is contained in:
Julius Unverfehrt 2022-11-08 15:16:53 +01:00
parent 8c5a979cc8
commit d14960da08
5 changed files with 15 additions and 7 deletions

View File

@ -8,6 +8,7 @@ def get_config():
class Config:
    """Runtime settings for the service, read from environment variables."""

    # Values (compared case-insensitively, after stripping whitespace) that
    # switch a boolean environment flag off. The empty string is included
    # because it was already falsy before explicit parsing was introduced.
    _FALSE_STRINGS = frozenset({"", "0", "false", "no", "off"})

    def __init__(self):
        self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
        # os.environ.get returns a str whenever the variable is set, and every
        # non-empty string (including "False" and "0") is truthy. Parse the
        # value explicitly so the flag can really be disabled via the
        # environment; when the variable is unset the default stays True.
        self.table_parsing_skip_pages_without_images = self._env_flag(
            "TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", default=True
        )
        # visual_logging_level: NOTHING > INFO > DEBUG > ALL
        # NOTE(review): "DISABLED" is not one of the levels listed above - confirm.
        self.visual_logging_level = "DISABLED"

    @staticmethod
    def _env_flag(name, default):
        """Return environment variable *name* as a bool.

        Args:
            name: Name of the environment variable to read.
            default: Value returned unchanged when the variable is not set.

        Returns:
            ``default`` if the variable is unset; otherwise ``False`` when the
            stripped, lower-cased value is one of ``_FALSE_STRINGS`` and
            ``True`` for any other value.
        """
        raw = os.environ.get(name)
        if raw is None:
            return default
        return raw.strip().lower() not in Config._FALSE_STRINGS

View File

@ -11,16 +11,21 @@ from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=False):
    """Return the analysis pipeline for the requested operation.

    Args:
        operation: ``"table"`` or ``"figure"``.
        table_parsing_skip_pages_without_images: Forwarded as
            ``skip_pages_without_images`` when building the table pipeline;
            the figure pipeline does not receive it. Defaults to ``False``
            (the default of ``make_analysis_pipeline``) so that callers which
            pass only ``operation`` keep working.

    Returns:
        The pipeline produced by ``make_analysis_pipeline`` for the operation.

    Raises:
        ValueError: If ``operation`` is neither ``"table"`` nor ``"figure"``.
    """
    if operation == "table":
        return make_analysis_pipeline(
            parse_tables,
            table_parsing_formatter,
            dpi=200,
            skip_pages_without_images=table_parsing_skip_pages_without_images,
        )
    if operation == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    # A bare `raise` here has no active exception to re-raise and would only
    # produce an uninformative RuntimeError; name the offending value instead.
    raise ValueError(f"Unsupported operation {operation!r}; expected 'table' or 'figure'.")
def make_analysis_pipeline(analysis_fn, formatter, dpi):
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
def analyse_pipeline(pdf: bytes, index=None):
def parse_page(page: ImagePlus):
image = page.asarray()
@ -30,7 +35,7 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi):
infos = formatter(rects, page, dpi)
return infos
pages = convert_pages_to_images(pdf, index=index, dpi=dpi)
pages = convert_pages_to_images(pdf, index=index, dpi=dpi, skip_pages_without_images=skip_pages_without_images)
results = map(parse_page, pages)
yield from flatten(filter(truth, results))

@ -1 +1 @@
Subproject commit f7292c30ad7c7ae5f07cee6925adda096301b60a
Subproject commit eb00e92329bc6c873bd9836a9cff8dd4252de030

View File

@ -3,6 +3,7 @@ import json
import logging
from operator import itemgetter
from cv_analysis.config import get_config
from cv_analysis.server.pipeline import get_analysis_pipeline
from cv_analysis.utils.banner import make_art
from pyinfra import config as pyinfra_config
@ -10,6 +11,7 @@ from pyinfra.queue.queue_manager import QueueManager
from pyinfra.storage.storage import get_storage
PYINFRA_CONFIG = pyinfra_config.get_config()
CV_CONFIG = get_config()
logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root)
@ -29,7 +31,7 @@ def analysis_callback(queue_message: dict):
should_publish_result = True
object_bytes = gzip.decompress(storage.get_object(bucket, object_name))
analysis_fn = get_analysis_pipeline(operation)
analysis_fn = get_analysis_pipeline(operation, CV_CONFIG.table_parsing_skip_pages_without_images)
results = analysis_fn(object_bytes)
response = {**queue_message, "data": list(results)}

View File

@ -59,6 +59,6 @@ def formatter(operation):
@pytest.mark.parametrize("operation", ["table", "figure"])
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
    """Run a pipeline built around the mocked analysis function on ``empty_pdf``.

    The ``operation`` parameter is not used directly here; presumably the
    ``formatter`` and ``expected_formatted_analysis_result`` fixtures consume
    it - verify against the fixture definitions.
    """
    analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False)
    # The pipeline is a generator; materialise it once and compare directly.
    results = list(analysis_pipeline(empty_pdf))
    assert results == expected_formatted_analysis_result