Pull request #35: RED-5527 update pdf2img with skip_pages_without_images flag, add ENV for this setting for table parsing
Merge in RR/cv-analysis from RED-5527 to master
Squashed commit of the following:
commit 1748095ce45f4a76e9d3d8547d9fc70f7deae2fb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Nov 8 15:15:16 2022 +0100
RED-5527 update pdf2img with skip_pages_without_images flag, add ENV for this setting for table parsing
This commit is contained in:
parent
8c5a979cc8
commit
d14960da08
@ -8,6 +8,7 @@ def get_config():
|
||||
class Config:
|
||||
def __init__(self):
|
||||
self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
|
||||
self.table_parsing_skip_pages_without_images = os.environ.get("TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", True)
|
||||
|
||||
# visual_logging_level: NOTHING > INFO > DEBUG > ALL
|
||||
self.visual_logging_level = "DISABLED"
|
||||
|
||||
@ -11,16 +11,21 @@ from pdf2img.default_objects.image import ImagePlus, ImageInfo
|
||||
from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
|
||||
def get_analysis_pipeline(operation):
|
||||
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images):
|
||||
if operation == "table":
|
||||
return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200)
|
||||
return make_analysis_pipeline(
|
||||
parse_tables,
|
||||
table_parsing_formatter,
|
||||
dpi=200,
|
||||
skip_pages_without_images=table_parsing_skip_pages_without_images,
|
||||
)
|
||||
elif operation == "figure":
|
||||
return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
def make_analysis_pipeline(analysis_fn, formatter, dpi):
|
||||
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
|
||||
def analyse_pipeline(pdf: bytes, index=None):
|
||||
def parse_page(page: ImagePlus):
|
||||
image = page.asarray()
|
||||
@ -30,7 +35,7 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi):
|
||||
infos = formatter(rects, page, dpi)
|
||||
return infos
|
||||
|
||||
pages = convert_pages_to_images(pdf, index=index, dpi=dpi)
|
||||
pages = convert_pages_to_images(pdf, index=index, dpi=dpi, skip_pages_without_images=skip_pages_without_images)
|
||||
results = map(parse_page, pages)
|
||||
|
||||
yield from flatten(filter(truth, results))
|
||||
|
||||
@ -1 +1 @@
|
||||
Subproject commit f7292c30ad7c7ae5f07cee6925adda096301b60a
|
||||
Subproject commit eb00e92329bc6c873bd9836a9cff8dd4252de030
|
||||
@ -3,6 +3,7 @@ import json
|
||||
import logging
|
||||
from operator import itemgetter
|
||||
|
||||
from cv_analysis.config import get_config
|
||||
from cv_analysis.server.pipeline import get_analysis_pipeline
|
||||
from cv_analysis.utils.banner import make_art
|
||||
from pyinfra import config as pyinfra_config
|
||||
@ -10,6 +11,7 @@ from pyinfra.queue.queue_manager import QueueManager
|
||||
from pyinfra.storage.storage import get_storage
|
||||
|
||||
PYINFRA_CONFIG = pyinfra_config.get_config()
|
||||
CV_CONFIG = get_config()
|
||||
|
||||
logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root)
|
||||
|
||||
@ -29,7 +31,7 @@ def analysis_callback(queue_message: dict):
|
||||
should_publish_result = True
|
||||
|
||||
object_bytes = gzip.decompress(storage.get_object(bucket, object_name))
|
||||
analysis_fn = get_analysis_pipeline(operation)
|
||||
analysis_fn = get_analysis_pipeline(operation, CV_CONFIG.table_parsing_skip_pages_without_images)
|
||||
|
||||
results = analysis_fn(object_bytes)
|
||||
response = {**queue_message, "data": list(results)}
|
||||
|
||||
@ -59,6 +59,6 @@ def formatter(operation):
|
||||
|
||||
@pytest.mark.parametrize("operation", ["table", "figure"])
|
||||
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
|
||||
analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200)
|
||||
analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False)
|
||||
results = list(analysis_pipeline(empty_pdf))
|
||||
assert list(results) == expected_formatted_analysis_result
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user