Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0027421628 |
@ -77,24 +77,29 @@ class Pipeline:
|
|||||||
|
|
||||||
|
|
||||||
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
|
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
|
||||||
"""Filter out duplicate images from the `position` (image coordinates), `page` and `representation` (perceptual hash).
|
"""Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
|
||||||
|
`allPassed` set to True.
|
||||||
See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
|
See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
|
||||||
"""
|
"""
|
||||||
seen = set()
|
keep = dict()
|
||||||
for item in metadata:
|
for image_meta in metadata:
|
||||||
key = (
|
key: tuple[int, int, int, int, int] = (
|
||||||
item["representation"],
|
image_meta["position"]["x1"],
|
||||||
item["position"]["x1"],
|
image_meta["position"]["x2"],
|
||||||
item["position"]["x2"],
|
image_meta["position"]["y1"],
|
||||||
item["position"]["y1"],
|
image_meta["position"]["y2"],
|
||||||
item["position"]["y2"],
|
image_meta["position"]["pageNumber"],
|
||||||
item["position"]["pageNumber"],
|
|
||||||
)
|
)
|
||||||
if key not in seen:
|
if key in keep:
|
||||||
seen.add(key)
|
|
||||||
yield item
|
|
||||||
else:
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
|
f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
|
||||||
)
|
)
|
||||||
continue
|
if image_meta["filters"]["allPassed"]:
|
||||||
|
logger.warning("Setting the image with allPassed flag set to True")
|
||||||
|
keep[key] = image_meta
|
||||||
|
else:
|
||||||
|
logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
|
||||||
|
else:
|
||||||
|
keep[key] = image_meta
|
||||||
|
|
||||||
|
yield from keep.values()
|
||||||
|
|||||||
@ -6,7 +6,12 @@ from image_prediction.pipeline import load_pipeline
|
|||||||
|
|
||||||
def test_all_duplicate_images_are_filtered():
|
def test_all_duplicate_images_are_filtered():
|
||||||
"""See RED-10765 (RM-241): Removed redactions reappear."""
|
"""See RED-10765 (RM-241): Removed redactions reappear."""
|
||||||
pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
|
pdf_path = (
|
||||||
|
Path(__file__).parents[1]
|
||||||
|
/ "data"
|
||||||
|
/ "RED-10765"
|
||||||
|
/ "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
|
||||||
|
)
|
||||||
pdf_bytes = pdf_path.read_bytes()
|
pdf_bytes = pdf_path.read_bytes()
|
||||||
|
|
||||||
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
|
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
|
||||||
@ -14,7 +19,17 @@ def test_all_duplicate_images_are_filtered():
|
|||||||
|
|
||||||
seen = set()
|
seen = set()
|
||||||
for prediction in predictions:
|
for prediction in predictions:
|
||||||
key = (prediction['representation'], prediction['position']['x1'], prediction['position']['x2'], prediction['position']['y1'], prediction['position']['y2'], prediction['position']['pageNumber'])
|
key = (
|
||||||
|
prediction["position"]["x1"],
|
||||||
|
prediction["position"]["x2"],
|
||||||
|
prediction["position"]["y1"],
|
||||||
|
prediction["position"]["y2"],
|
||||||
|
prediction["position"]["pageNumber"],
|
||||||
|
)
|
||||||
assert key not in seen, f"Duplicate found: {key}"
|
assert key not in seen, f"Duplicate found: {key}"
|
||||||
seen.add(key)
|
seen.add(key)
|
||||||
|
|
||||||
|
all_passed = sum(1 for prediction in predictions if prediction["filters"]["allPassed"])
|
||||||
|
assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"
|
||||||
|
|
||||||
|
assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user