fix(pdf conversion): repair broken handling of bad x-refs

This commit is contained in:
Julius Unverfehrt 2024-02-08 17:04:03 +01:00
parent dadc0a4163
commit 6163e29d6b
4 changed files with 12 additions and 120 deletions

View File

@ -92,12 +92,12 @@ def get_images_on_page(doc, metadata):
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
    """Return the metadata of the valid images on ``page`` as a list.

    The pipeline runs in three explicit steps: gather the image metadata of
    the page, keep only the entries accepted by ``filter_valid_metadata``,
    and pass the survivors together with ``doc`` to
    ``add_alpha_channel_info``. The result is materialised into a list.

    Args:
        doc: The document that ``page`` belongs to.
        page: The page whose image metadata is collected.

    Returns:
        A list holding the output of ``add_alpha_channel_info``.
    """
    # Single, explicit pipeline: a second composed variant of the same steps
    # placed before these lines would make them unreachable.
    metadata = get_metadata_for_images_on_page(page)
    metadata = filter_valid_metadata(metadata)
    metadata = add_alpha_channel_info(doc, metadata)
    return list(metadata)
def get_metadata_for_images_on_page(page: fitz.Page):
@ -207,7 +207,11 @@ def add_alpha_channel_info(doc, metadata):
@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
    """Extract the image referenced by ``xref`` from ``doc``.

    Args:
        doc: Object exposing ``extract_image(xref)``; must be hashable because
            results are memoised per ``(doc, xref)`` pair.
        xref: Cross-reference number of the image to extract.

    Returns:
        Whatever ``doc.extract_image(xref)`` returns, or ``None`` when that
        call raises ``ValueError`` (treated as an invalid xref). The ``None``
        result is cached as well, so a bad xref is only attempted once.
    """
    # The guard must wrap the only call to extract_image; an unguarded call
    # ahead of it would let the ValueError escape and skip this handling.
    try:
        return doc.extract_image(xref)
    except ValueError:
        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
        return None
# Helper built by composing ``round`` and ``int``.
# NOTE(review): presumably rcompose applies its arguments left to right
# (round first, then int) -- confirm against the rcompose import.
rounder = rcompose(round, int)

View File

@ -1,10 +1,3 @@
import logging
import pytest
from image_prediction.utils import get_logger
pytest_plugins = [
"test.fixtures.extractor",
"test.fixtures.image",
@ -17,14 +10,5 @@ pytest_plugins = [
"test.fixtures.parameters",
"test.fixtures.pdf",
"test.fixtures.target",
"test.unit_tests.image_stitching_test"
"test.unit_tests.image_stitching_test",
]
@pytest.fixture(autouse=True)
def mute_logger():
    """Silence the project logger for the duration of each test.

    The logger level is raised above ``logging.CRITICAL`` before the test
    runs and reset to its previous value afterwards.
    """
    project_logger = get_logger()
    previous_level = project_logger.level
    silent_level = logging.CRITICAL + 1

    project_logger.setLevel(silent_level)
    yield
    project_logger.setLevel(previous_level)

View File

@ -1,48 +0,0 @@
import tempfile
import pytest
import yaml
from image_prediction.config import Config
@pytest.fixture
def config_file_content():
    """Provide the nested mapping that the ``config`` fixture dumps to YAML."""
    list_section = [{"B": [1, 2]}, {"C": 3}, 4]
    mapping_section = {"E": {"F": True}}
    return {"A": list_section, "D": mapping_section}
@pytest.fixture
def config(config_file_content):
    """Yield a ``Config`` loaded from a temporary YAML file.

    ``config_file_content`` is dumped in block style to a named temporary
    ``.yaml`` file, and the ``Config`` is built from that file's path while
    the file is still open.
    """
    yaml_file = tempfile.NamedTemporaryFile(suffix=".yaml", mode="w")
    with yaml_file:
        yaml.dump(config_file_content, yaml_file, default_flow_style=False)
        yield Config(yaml_file.name)
def test_dot_access_key_exists(config):
    """Attribute-style access returns the stored values for existing keys."""
    expected_a = [{"B": [1, 2]}, {"C": 3}, 4]
    assert config.A == expected_a
    assert config.D.E["F"]
def test_access_key_exists(config):
    """Item-style access works at every nesting level of an existing key."""
    innermost = [1, 2]
    first_entry = {"B": innermost}

    assert config["A"] == [first_entry, {"C": 3}, 4]
    assert config["A"][0] == first_entry
    assert config["A"][0]["B"] == innermost
    assert config["A"][0]["B"][0] == 1
def test_dot_access_key_does_not_exists(config):
    """Attribute-style access to a missing key yields ``None``."""
    missing_value = config.B
    assert missing_value is None
def test_access_key_does_not_exists(config):
    """Item-style access to a missing key yields ``None``."""
    missing_value = config["B"]
    assert missing_value is None
def test_get_method_returns_key_if_key_does_exist(config):
    """``get`` returns the stored value when the key is present."""
    looked_up = config.D.E.get("F", "default_value")
    assert looked_up is True
def test_get_method_returns_default_if_key_does_not_exist(config):
    """``get`` falls back to the supplied default when the key is absent."""
    looked_up = config.D.E.get("X", "default_value")
    assert looked_up == "default_value"

View File

@ -1,48 +0,0 @@
import json
import pytest
from image_prediction.exceptions import IntentionalTestException
from image_prediction.flask import make_prediction_server
def predict_fn(x: bytes):
    """Stub prediction function used to drive the test server.

    Decodes ``x`` and parses it as an integer. Returns ``True`` for 42 and
    raises ``IntentionalTestException`` for any other integer.
    """
    number = int(x.decode())
    if number != 42:
        raise IntentionalTestException("This is intended.")
    return True
@pytest.fixture
def server():
    """Build the prediction server around ``predict_fn`` with TESTING enabled."""
    testing_overrides = {"TESTING": True}
    app = make_prediction_server(predict_fn)
    app.config.update(testing_overrides)
    return app
@pytest.fixture
def client(server):
    """Provide a test client bound to the ``server`` fixture."""
    test_client = server.test_client()
    return test_client
def test_server_predict_success(client, mute_logger):
    """Posting "42" to /predict yields a truthy JSON payload."""
    reply = client.post("/predict", data="42")
    payload = json.loads(reply.data)
    assert payload
def test_server_predict_failure(client, mute_logger):
    """Posting "13" to /predict is answered with HTTP 500."""
    reply = client.post("/predict", data="13")
    status = reply.status_code
    assert status == 500
def test_server_health_check(client):
    """GET /health answers 200 with the JSON body "OK"."""
    health_reply = client.get("/health")
    assert health_reply.status_code == 200
    assert health_reply.json == "OK"
def test_server_ready_check(client):
    """GET /ready answers 200 with the JSON body "OK"."""
    ready_reply = client.get("/ready")
    assert ready_reply.status_code == 200
    assert ready_reply.json == "OK"