Merge in RR/cv-analysis from add-pdf2image-module to master
Squashed commit of the following:
commit 13355e2dd006fae9ee05c2d00acbbc8b38fd1e8e
Merge: eaf4627 edbda58
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 13:35:27 2022 +0200
Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into add-pdf2image-module
commit eaf462768787642889d496203034d017c4ec959b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 13:26:58 2022 +0200
update build scripts
commit d429c713f4e5e74afca81c2354e8125bf389b865
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 13:11:07 2022 +0200
purge target
commit 349b81c5db724bf70d6f31b58ded2b5414216bfe
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 13:07:58 2022 +0200
Revert "extinguish target"
This reverts commit d2bd4cefde0648d2487839b0344509b984435273.
commit d2bd4cefde0648d2487839b0344509b984435273
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 12:57:50 2022 +0200
extinguish target
commit 5f6cc713db31e3e16c8e7f13a59804c86b5d77d7
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 11:58:52 2022 +0200
refactor
commit 576019378a39b580b816d9eb7957774f1faf48b9
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 11:52:04 2022 +0200
add test for adjusted server analysis pipeline logic
commit bdf0121929d6941cbba565055f37df7970925c79
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 11:30:17 2022 +0200
update analysis pipeline logic to use imported pdf2image
commit f7cef98d5e6d7b95517bbd047dd3e958acebb3d8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 2 11:04:34 2022 +0200
add pdf2image as git submodule
59 lines
2.1 KiB
Python
59 lines
2.1 KiB
Python
from functools import partial
|
|
from itertools import starmap
|
|
from operator import truth
|
|
from typing import Callable, Iterator
|
|
|
|
from funcy import lmap
|
|
|
|
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
|
|
from cv_analysis.layout_parsing import parse_layout
|
|
from cv_analysis.table_parsing import parse_tables
|
|
from cv_analysis.utils.structures import Rectangle
|
|
from pdf2img.conversion import convert_pdf_to_image_and_metadata_stream
|
|
|
|
|
|
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
    """Build a PDF-to-results pipeline around a single page-analysis function.

    The returned callable takes the raw bytes of a PDF and lazily yields one
    dict per page: the page metadata coming out of the PDF conversion,
    extended with a "bboxes" entry that holds the serialised analysis results.

    Note:
        Pages on which the analysis finds nothing are left out of the stream.

    Steps:
        1. Convert the PDF into a stream of (page image, page metadata) pairs.
        2. For every page:
            a. Run `analysis_fn` on the image to obtain rectangles
               (e.g. table cells).
            b. Rescale every rectangle from pixel values with
               `convert_pixel_rect_to_inches_rect` (a factor of 72 / dpi).
            c. Serialise every rectangle with its `json_full()` method.
        3. Skip pages that produced no rectangles.

    Args:
        analysis_fn: Callable that receives a page image and returns an
            iterable of rectangles.
        dpi: Resolution handed to the PDF conversion; the same value is used
            to rescale the resulting rectangles.

    Returns:
        A generator function `analysis_pipeline(pdf, index=None)`.
    """

    def to_page_result(image, metadata):
        """Analyse one page image and merge the found boxes into its metadata."""
        bboxes = [
            convert_pixel_rect_to_inches_rect(found, dpi=dpi).json_full()
            for found in analysis_fn(image)
        ]
        if not bboxes:
            # An empty dict is falsy, which is what lets the caller drop the page.
            return {}
        return {**metadata, "bboxes": bboxes}

    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
        """Yield the non-empty per-page results for the given PDF bytes."""
        pages = convert_pdf_to_image_and_metadata_stream(pdf, index=index, dpi=dpi)
        for page in pages:
            # Each stream item is spread into (image, metadata) positional arguments.
            page_result = to_page_result(*page)
            if page_result:
                yield page_result

    return analysis_pipeline
|
|
|
|
|
|
def get_analysis_fn(analysis_type):
    """Return the analysis callable that belongs to `analysis_type`.

    Args:
        analysis_type: Name of the analysis; one of "table", "layout" or
            "figure".

    Returns:
        `parse_tables` for "table", `parse_layout` for "layout", or the
        result of `make_figure_detection_pipeline()` for "figure".

    Raises:
        ValueError: If `analysis_type` is none of the supported names.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        # Unlike the other two, the figure detector is built on each request.
        return make_figure_detection_pipeline()
    # Fail with an explicit error that names the offending value; a bare
    # `raise` without an active exception only yields an opaque RuntimeError.
    raise ValueError(
        f"Unknown analysis type {analysis_type!r}; "
        "expected one of 'table', 'layout', 'figure'."
    )
|
|
|
|
|
|
def convert_pixel_rect_to_inches_rect(rect, dpi):
    """Rescale a pixel-space rectangle by a factor of 72 / dpi.

    Every value returned by `rect.xyxy()` is divided by `dpi` and then
    multiplied by 72. Despite the function name, the result is therefore
    expressed in 1/72-inch units rather than in plain inches.

    Args:
        rect: Object exposing an `xyxy()` method that returns the rectangle's
            coordinate values in pixels.
        dpi: Resolution (pixels per inch) the pixel values refer to.

    Returns:
        A new `Rectangle` created with `Rectangle.from_xyxy(...,
        discrete=False)` from the rescaled values.
    """
    units_per_inch = 72
    scaled = tuple(value / dpi * units_per_inch for value in rect.xyxy())
    return Rectangle.from_xyxy(scaled, discrete=False)
|