diff --git a/cv_analysis/config.py b/cv_analysis/config.py index 550c2ee..7dea359 100644 --- a/cv_analysis/config.py +++ b/cv_analysis/config.py @@ -8,6 +8,7 @@ def get_config(): class Config: def __init__(self): self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO") + self.table_parsing_skip_pages_without_images = os.environ.get("TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", True) # visual_logging_level: NOTHING > INFO > DEBUG > ALL self.visual_logging_level = "DISABLED" diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py index bca7f0c..01aa05e 100644 --- a/cv_analysis/server/pipeline.py +++ b/cv_analysis/server/pipeline.py @@ -11,16 +11,21 @@ from pdf2img.default_objects.image import ImagePlus, ImageInfo from pdf2img.default_objects.rectangle import RectanglePlus -def get_analysis_pipeline(operation): +def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images): if operation == "table": - return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200) + return make_analysis_pipeline( + parse_tables, + table_parsing_formatter, + dpi=200, + skip_pages_without_images=table_parsing_skip_pages_without_images, + ) elif operation == "figure": return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200) else: raise -def make_analysis_pipeline(analysis_fn, formatter, dpi): +def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False): def analyse_pipeline(pdf: bytes, index=None): def parse_page(page: ImagePlus): image = page.asarray() @@ -30,7 +35,7 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi): infos = formatter(rects, page, dpi) return infos - pages = convert_pages_to_images(pdf, index=index, dpi=dpi) + pages = convert_pages_to_images(pdf, index=index, dpi=dpi, skip_pages_without_images=skip_pages_without_images) results = map(parse_page, pages) yield from flatten(filter(truth, results)) diff --git a/incl/pdf2image b/incl/pdf2image index f7292c3..eb00e92 160000 --- a/incl/pdf2image +++ b/incl/pdf2image @@ -1 +1 @@ -Subproject commit f7292c30ad7c7ae5f07cee6925adda096301b60a +Subproject commit eb00e92329bc6c873bd9836a9cff8dd4252de030 diff --git a/src/serve.py b/src/serve.py index 8fd2fc9..81405bd 100644 --- a/src/serve.py +++ b/src/serve.py @@ -3,6 +3,7 @@ import json import logging from operator import itemgetter +from cv_analysis.config import get_config from cv_analysis.server.pipeline import get_analysis_pipeline from cv_analysis.utils.banner import make_art from pyinfra import config as pyinfra_config @@ -10,6 +11,7 @@ from pyinfra.queue.queue_manager import QueueManager from pyinfra.storage.storage import get_storage PYINFRA_CONFIG = pyinfra_config.get_config() +CV_CONFIG = get_config() logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root) @@ -29,7 +31,7 @@ def analysis_callback(queue_message: dict): should_publish_result = True object_bytes = gzip.decompress(storage.get_object(bucket, object_name)) - analysis_fn = get_analysis_pipeline(operation) + analysis_fn = get_analysis_pipeline(operation, CV_CONFIG.table_parsing_skip_pages_without_images) results = analysis_fn(object_bytes) response = {**queue_message, "data": list(results)} diff --git a/test/unit_tests/server_pipeline_test.py b/test/unit_tests/server_pipeline_test.py index a5e809b..b0f9a28 100644 --- a/test/unit_tests/server_pipeline_test.py +++ b/test/unit_tests/server_pipeline_test.py @@ -59,6 +59,6 @@ def formatter(operation): @pytest.mark.parametrize("operation", ["table", "figure"]) def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result): - analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200) + analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False) results = list(analysis_pipeline(empty_pdf)) assert list(results) == expected_formatted_analysis_result