Julius Unverfehrt 4102a564a3 fix: add small image filter logic
Introduces a preprocessing that scans each page for page sized images.
If one is encountered, all images that are below a configured ratio in
respect to the page size are dropped.

This step has to occur before the image stiching logic, but MIGHT
introduce the problem of dropping image parts that might constitue an
image. This hoever is not solveable since we want to drop the small
images before further processing since the faulty character images are
also stiched to a valid image, that in reality isn't an image.
2024-08-06 16:52:05 +02:00

39 lines
1.3 KiB
TOML

[logging]
level = "INFO"
[service]
# Print document processing progress to stdout
verbose = false
batch_size = 16
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5
# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75
[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0
# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10
# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4