Pull request #23: Add pdf2image module

Merge in RR/cv-analysis from add-pdf2image-module to master

Squashed commit of the following:

commit 13355e2dd006fae9ee05c2d00acbbc8b38fd1e8e
Merge: eaf4627 edbda58
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 13:35:27 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into add-pdf2image-module

commit eaf462768787642889d496203034d017c4ec959b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 13:26:58 2022 +0200

    update build scripts

commit d429c713f4e5e74afca81c2354e8125bf389b865
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 13:11:07 2022 +0200

    purge target

commit 349b81c5db724bf70d6f31b58ded2b5414216bfe
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 13:07:58 2022 +0200

    Revert "extinguish target"

    This reverts commit d2bd4cefde0648d2487839b0344509b984435273.

commit d2bd4cefde0648d2487839b0344509b984435273
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 12:57:50 2022 +0200

    extinguish target

commit 5f6cc713db31e3e16c8e7f13a59804c86b5d77d7
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 11:58:52 2022 +0200

    refactor

commit 576019378a39b580b816d9eb7957774f1faf48b9
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 11:52:04 2022 +0200

    add test for adjusted server analysis pipeline logic

commit bdf0121929d6941cbba565055f37df7970925c79
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 11:30:17 2022 +0200

    update analysis pipeline logic to use imported pdf2image

commit f7cef98d5e6d7b95517bbd047dd3e958acebb3d8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 2 11:04:34 2022 +0200

    add pdf2image as git submodule
This commit is contained in:
Julius Unverfehrt 2022-08-02 13:36:50 +02:00
parent edbda58837
commit 016abe46de
16 changed files with 78 additions and 314 deletions

3
.gitmodules vendored
View File

@ -1,3 +1,6 @@
[submodule "incl/pyinfra"]
path = incl/pyinfra
url = ssh://git@git.iqser.com:2222/rr/pyinfra.git
[submodule "incl/pdf2image"]
path = incl/pdf2image
url = ssh://git@git.iqser.com:2222/rr/pdf2image.git

View File

@ -13,8 +13,13 @@ RUN python3 -m pip install -r requirements.txt
COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
RUN python -m pip install -r incl/pyinfra/requirements.txt
COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt
RUN python -m pip install -r incl/pdf2image/requirements.txt
COPY ./incl ./incl
RUN python3 -m pip install -e incl/pyinfra
RUN python3 -m pip install -e incl/pdf2image
COPY ./src ./src
COPY ./cv_analysis ./cv_analysis

View File

@ -12,6 +12,9 @@ echo "dev setup for unit test and coverage"
pip install -e incl/pyinfra
pip install -r incl/pyinfra/requirements.txt
pip install -e incl/pdf2image
pip install -r incl/pdf2image/requirements.txt
pip install -e .
pip install -r requirements.txt

View File

@ -1,19 +0,0 @@
#!/bin/bash
# Release deploy step: for every version tag other than "dev", write the tag
# into pom.xml as the project version and run a Maven clean deploy against
# the release repository.
set -e
# NOTE(review): the double quotes are backslash-escaped, so both operands carry
# literal quote characters; the comparison still behaves like `tag != dev`.
# Presumably an artifact of embedding this script in another format - confirm.
if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
then
# Set the project version in pom.xml to the build's version tag.
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-f ${bamboo_build_working_directory}/pom.xml \
versions:set \
-DnewVersion=${bamboo_version_tag}
# Build and deploy in batch mode (-B) with error stack traces (-e).
# NOTE(review): the maven.wagon.http.ssl.* properties appear to relax TLS
# certificate checks for the upload - confirm they are still required.
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-f ${bamboo_build_working_directory}/pom.xml \
-B clean deploy \
-e -DdeployAtEnd=true \
-Dmaven.wagon.http.ssl.insecure=true \
-Dmaven.wagon.http.ssl.allowall=true \
-Dmaven.wagon.http.ssl.ignore.validity.dates=true \
-DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi

View File

@ -1,19 +0,0 @@
#!/bin/bash
# Docker build step: pulls DVC-tracked data, builds the base image and the
# service image, then pushes the service image to the Nexus registry.
# Arguments: $1 = service image name, $2 = base image name.
set -e
SERVICE_NAME=$1
SERVICE_NAME_BASE=$2
# TODO version tag on master push
# Work inside a dedicated virtualenv (build_venv) for the dvc tooling.
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
pip install dvc
pip install 'dvc[ssh]'
dvc pull
# Appends the Nexus package index URL, including the credentials, to pip.conf
# in the working directory.
# NOTE(review): presumably consumed by the Docker builds below - verify the
# credentials do not end up in an image layer.
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
# Build the base image, then the service image (which receives the version
# tag as a build argument).
docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
# The password is piped via stdin so it is not passed as a command-line argument.
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
# Only the service image is pushed by this script; the base image is built but not pushed here.
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}

View File

@ -1,9 +0,0 @@
#!/bin/bash
# Writes the tag to use for this build into the file git.tag as a single
# `gitTag=<value>` line (the file is overwritten, not appended to).
set -e
if [[ "${bamboo_version_tag}" = "dev" ]]
then
# Dev builds: derive the tag from the branch name and the build number.
echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
else
# Any other build: use the version tag as is.
echo "gitTag=${bamboo_version_tag}" > git.tag
fi

View File

@ -1,61 +0,0 @@
#!/bin/bash
# Quality gate step: installs the project into a virtualenv, runs the tests
# under coverage, runs dependency-check and finally a Sonar scan (branch or
# pull-request analysis, depending on whether a PR key is set).
# Arguments: $1 = service name (used for the report project and the Sonar project key).
set -e
# Point JAVA_HOME at the JRE located under the sonar-scanner install path.
export JAVA_HOME=/usr/bin/sonar-scanner/jre
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
# Install the project and the pyinfra submodule in editable mode, plus their requirements.
pip install -e .
pip install -e incl/pyinfra
pip install -r incl/pyinfra/requirements.txt
pip install -r requirements.txt
echo "DVC pull step"
dvc pull
echo "coverage calculation"
coverage run -m pytest test
echo "coverage report generation"
coverage report -m
# NOTE(review): `coverage xml` is invoked without an output path, while Sonar
# is pointed at reports/coverage.xml below - confirm a coverage configuration
# redirects the XML report into reports/.
coverage xml
SERVICE_NAME=$1
echo "dependency-check:aggregate"
mkdir -p reports
# Scan the working tree for vulnerable dependencies; JSON, HTML and XML
# reports are written to reports/. VCS, virtualenv, cache and bamboo-specs
# directories are excluded.
dependency-check --enableExperimental -f JSON -f HTML -f XML \
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
--exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
# No PR key set -> branch analysis; otherwise pull-request analysis.
if [[ -z "${bamboo_repository_pr_key}" ]]
then
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
# NOTE(review): there is no space between `-X` and the line-continuation
# backslash; unless the next line starts with whitespace, the shell joins
# `-X` with the following option into one token - confirm against the
# original indentation.
/usr/bin/sonar-scanner/bin/sonar-scanner -X\
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.sources=src,cv_analysis \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
else
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
# Same scan, but reported against the pull request (key, source and target branch).
/usr/bin/sonar-scanner/bin/sonar-scanner \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.sources=src,cv_analysis \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi

View File

@ -1,44 +1,42 @@
from functools import partial
from typing import Callable
from itertools import starmap
from operator import truth
from typing import Callable, Iterator
from funcy import lmap
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.server.rotate import rotate_rectangle
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pdf_to_image_and_metadata_stream
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
"""Make end-to-end pipeline to analyse a PDF with given analysis function.
The pipeline returns a Generator of dicts containing page information and the analysis results.
The pipeline streams dicts containing page information and the analysis results.
Note:
If there are no results on a page, the page is skipped in result stream
Steps:
Convert PDF to pairs of image and page information
Analyse pages, get list of bounding boxes per page (e.g. table cells)
Convert PDF to a stream of page as image and metadata (page information) tuples
Analyse pages:
Get list of bounding boxes per page (e.g. table cells)
Convert pixel values to inches
Rotate results if page is rotated
Format results to stream of dictionaries with page information and analysis results
Format results
"""
def pipeline(pdf: bytes, index=None):
image_metadata_pairs = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi)
results = map(image_metadata_pair_to_results, image_metadata_pairs)
results_filtered = filter(lambda x: x["bboxes"], results)
return results_filtered
def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
image_metadata_stream = convert_pdf_to_image_and_metadata_stream(pdf, index=index, dpi=dpi)
results = starmap(analyse_image_metadata_pair, image_metadata_stream)
yield from filter(truth, results)
def image_metadata_pair_to_results(image_metadata_pair):
rectangles = analysis_fn(image_metadata_pair.image)
rectangles = map(partial(pixel_rect_to_inches_rect, dpi=dpi), rectangles)
if image_metadata_pair.metadata["rotation"] != 0:
rotate_rectangle_fn = partial(rotate_rectangle, metadata=image_metadata_pair.metadata)
rectangles = map(rotate_rectangle_fn, rectangles)
bboxes = lmap(lambda x: x.json_xyxy(), rectangles)
return {**image_metadata_pair.metadata, "bboxes": bboxes}
def analyse_image_metadata_pair(image, metadata):
rectangles = analysis_fn(image)
rectangles = map(partial(convert_pixel_rect_to_inches_rect, dpi=dpi), rectangles)
bboxes = lmap(lambda x: x.json_full(), rectangles)
return {**metadata, "bboxes": bboxes} if bboxes else {}
return pipeline
return analysis_pipeline
def get_analysis_fn(analysis_type):
@ -52,10 +50,9 @@ def get_analysis_fn(analysis_type):
raise
def pixel_rect_to_inches_rect(rect, dpi):
def convert_pixel_to_inch(pixel):
def convert_pixel_rect_to_inches_rect(rect, dpi):
def pixel_to_inch(pixel):
return pixel / dpi * 72
bbox = rect.x1, rect.y1, rect.x2, rect.y2
bbox_inches = tuple(map(convert_pixel_to_inch, bbox))
bbox_inches = tuple(map(pixel_to_inch, rect.xyxy()))
return Rectangle.from_xyxy(bbox_inches, discrete=False)

View File

@ -1,107 +0,0 @@
from _operator import itemgetter
import numpy as np
from cv_analysis.utils.structures import Rectangle
def rotate_rectangle(rectangle, metadata):
    """Transform a rectangle against the rotation recorded in the page metadata.

    Args:
        rectangle: object exposing ``xyxy()`` which yields (x1, y1, x2, y2)
        metadata: mapping with the keys "width", "height" and "rotation"; a rotation outside of 0, 1, 2, 3 is
            floor-divided by 90 to obtain the number of quarter turns

    Returns:
        A new, non-discrete Rectangle built from the re-sorted, transformed corner coordinates.
    """
    page_width = metadata["width"]
    page_height = metadata["height"]
    quarter_turns = metadata["rotation"]

    if quarter_turns not in [0, 1, 2, 3]:
        quarter_turns = quarter_turns // 90

    # For a quarter or three-quarter turn the page dimensions trade places.
    if quarter_turns in [1, 3]:
        page_width, page_height = page_height, page_width

    left, top, right, bottom = rectangle.xyxy()
    # Columns are the two corner points, row 0 holds x and row 1 holds y values.
    corners = np.vstack([[left, top], [right, bottom]]).T
    transformed = rotate_and_shift(corners, quarter_turns, (page_width, page_height))

    # The transformed columns are no longer ordered as (min, max) corners, so sort per axis.
    x_low, x_high = sorted(transformed[0, :])
    y_low, y_high = sorted(transformed[1, :])
    return Rectangle.from_xyxy((x_low, y_low, x_high, y_high), discrete=False)
def rotate_and_shift(matrix, rotation, size, debug=False):
    """Rotates a matrix against (!) a specified rotation. That is, the rotation is applied negatively. The matrix is
    also shifted to ensure it contains points (columns) in quadrant I.

    Procedure:
        1) Rotate the matrix clockwise according to rotation value
        2) Shift the matrix back into quadrant I

    Note:
        After the transformation the columns are in general no longer the lower left and upper right corners of
        the rectangle they were taken from; the caller has to re-sort the coordinates per axis.

    Args:
        matrix: matrix to transform; columns are points, row 0 holds the x and row 1 the y values
        rotation: any of 0, 1, 2, or 3, where 1 = 90 degree CLOCKWISE rotation etc.
        size: the size of the page as a tuple (<width>, <height>)
        debug: Visualizes the transformations for later re-understanding of the code

    Returns:
        The rotated and shifted matrix.

    Raises:
        ValueError: if rotation is not one of 0, 1, 2, or 3.
    """

    def shift_to_quadrant_1(matrix):
        width, height = size
        # (x, y) translation per number of quarter turns that moves the rotated points back into quadrant I.
        offsets = {
            0: (0, 0),
            1: (0, height),
            2: (width, height),
            3: (width, 0),
        }
        if rotation not in offsets:
            raise ValueError(f"Unexpected rotation value '{rotation}'. Expected any of 0, 1, 2, or 3.")

        # A (2, 1) column vector broadcasts the x offset over row 0 and the y offset over row 1 for every point.
        # Multiplying a 2x2 matrix of ones by the size tuple instead would broadcast along the columns and hand
        # the first point a width and the second point a height offset on both axes.
        back_shift = np.array(offsets[rotation]).reshape(2, 1)
        return matrix + back_shift

    # PDF rotations are clockwise, hence subtract the radian value of the rotation from 2 pi
    radians = (2 * np.pi) - (np.pi * (rotation / 2))
    matrix_rotated = rotate(matrix, radians)
    matrix_rotated_and_shifted = shift_to_quadrant_1(matrix_rotated)

    if debug:
        __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted)

    return matrix_rotated_and_shifted
def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted):
    """Plot the page frame and the points before and after the transformation (debugging aid).

    The left axes show the unrotated page frame (red) and the input points (default colour). The right axes show
    the rotated frame (red), the rotated points (green) and the rotated and shifted points (blue).

    Args:
        size: the size of the page as a tuple (<width>, <height>)
        radians: angle that was passed to ``rotate``
        matrix: points before the transformation (columns are points)
        matrix_rotated: points after the rotation
        matrix_rotated_and_shifted: points after rotation and shift
    """
    import matplotlib.pyplot as plt

    # Factor applied to all vectors before they are drawn.
    display_scale = 0.005

    frame = np.eye(2) * size
    frame_rotated = rotate(frame, radians)

    # Scale into new arrays instead of in place: an in-place ``*=`` with a float raises a casting error for
    # integer matrices, and working on fresh arrays leaves the caller's data untouched without copying it first.
    f1, f2, m1, m2, m3 = (
        np.asarray(vectors) * display_scale
        for vectors in (frame, frame_rotated, matrix, matrix_rotated, matrix_rotated_and_shifted)
    )

    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    axes = axes.ravel()

    axes[0].quiver([0, 0], [0, 0], f1[0, :], f1[1, :], scale=5, scale_units="inches", color="red")
    axes[1].quiver([0, 0], [0, 0], f2[0, :], f2[1, :], scale=5, scale_units="inches", color="red")
    axes[0].quiver([0, 0], [0, 0], m1[0, :], m1[1, :], scale=5, scale_units="inches")
    axes[1].quiver([0, 0], [0, 0], m2[0, :], m2[1, :], scale=5, scale_units="inches", color="green")
    axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue")

    plt.show()
def rotate(input_matrix, radians):
    """Multiply ``input_matrix`` from the left with the 2D rotation matrix for ``radians``.

    Args:
        input_matrix: matrix whose columns are the points to rotate
        radians: rotation angle; positive values turn counter-clockwise

    Returns:
        The product of the rotation matrix and ``input_matrix``.
    """
    cosine, sine = np.cos(radians), np.sin(radians)
    rotation_matrix = np.vstack(((cosine, -sine), (sine, cosine)))
    return rotation_matrix.dot(input_matrix)

View File

@ -1,46 +0,0 @@
from dataclasses import dataclass
from functools import partial
from typing import Iterator
import fitz
import numpy as np
@dataclass
class ImageMetadataPair:
    """Bundle of one rendered page image and the metadata dict describing that page."""

    # Page rendered as an array; built as (height, width, channels) by page_to_image_metadata_pair.
    image: np.ndarray
    # Page information as produced by get_page_info.
    metadata: dict
def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
    """Lazily convert a PDF into one ImageMetadataPair per page.

    Args:
        pdf: the PDF document as bytes
        index: iterable of page indices to convert; if it is not given (or otherwise falsy) the whole PDF
            is processed
        dpi: resolution used to render each page

    Yields:
        ImageMetadataPair for each streamed page, in streaming order.
    """
    for page in stream_pages(pdf, index):
        yield page_to_image_metadata_pair(page, dpi=dpi)
def page_to_image_metadata_pair(page: fitz.Page, dpi):
    """Render a single page at ``dpi`` and pair the resulting array with the page's metadata.

    Args:
        page: the page to render
        dpi: rendering resolution handed to ``page.get_pixmap``

    Returns:
        ImageMetadataPair holding the image as (height, width, channels) uint8 array and the page info dict.
    """
    page_info = get_page_info(page)
    pixmap = page.get_pixmap(dpi=dpi)
    # The pixmap exposes its samples as a flat byte buffer; view it with the pixmap's own dimensions.
    image_shape = (pixmap.h, pixmap.w, pixmap.n)
    image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*image_shape)
    return ImageMetadataPair(image, page_info)
def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
    """Open the PDF from memory and yield its pages one by one.

    Args:
        pdf: the PDF document as bytes
        index: iterable of page indices to yield, in the given order; any falsy value (None, empty
            collection) yields every page of the document

    Yields:
        The selected pages. The document handle stays open until the generator is exhausted or closed.
    """
    with fitz.open(stream=pdf) as document:
        if index:
            yield from (document[page_number] for page_number in index)
        else:
            yield from document
def get_page_info(page):
    """Collect index, rotation and dimensions of a page into a dict.

    Returns:
        Dict with the keys "index", "rotation", "width" and "height".
    """
    page_rect = page.rect
    # NOTE(review): the dimensions are presumably in points (1/72 inch) rather than inches - a default A4 page
    # measures 595 x 842 here; confirm against the PyMuPDF documentation.
    info = {"index": page.number, "rotation": page.rotation}
    info["width"] = page_rect.width  # width of the page rectangle as reported for the (rotated) page
    info["height"] = page_rect.height  # height of the page rectangle as reported for the (rotated) page
    return info

1
incl/pdf2image Submodule

@ -0,0 +1 @@
Subproject commit d1a68b9e580ecbc0cd3050deeedc2d648b377232

@ -1 +1 @@
Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9
Subproject commit 0f6512df5423df98d334f5735170cd1f7642998a

View File

@ -1,24 +0,0 @@
import fitz
import numpy as np
import pytest
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
@pytest.fixture
def pdf(n_pages):
    """Build an in-memory PDF with ``n_pages`` pages, each carrying one line of text, and return its bytes."""
    document = fitz.open()
    text_origin = fitz.Point(50, 100)
    for _ in range(n_pages):
        new_page = document.new_page()
        new_page.insert_text(text_origin, "De gustibus non est disputandum.", fontsize=30)
    return document.write()
@pytest.mark.parametrize("n_pages", [1])
def test_pdf_to_array_and_metadata(pdf):
    """Every streamed pair holds an image array of the expected shape and a metadata dict."""
    expected_shape = (2339, 1653, 3)  # Height, Width, Color channels
    for pair in pdf_to_image_metadata_pairs(pdf):
        image, metadata = pair.image, pair.metadata
        assert isinstance(image, np.ndarray)
        assert image.shape == expected_shape
        assert isinstance(metadata, dict)

View File

@ -0,0 +1,40 @@
import fitz
import numpy as np
import pytest
from cv_analysis.server.pipeline import make_analysis_pipeline
from cv_analysis.utils.structures import Rectangle
def analysis_fn_mock(image: np.ndarray):
    """Stand-in analysis function: ignores the image and always reports one 42 x 42 pixel rectangle at the origin."""
    return [Rectangle.from_xyxy((0, 0, 42, 42))]
@pytest.fixture
def empty_pdf(n_pages):
    """Build an in-memory PDF consisting of ``n_pages`` blank pages and return its bytes."""
    document = fitz.open()
    for _ in range(n_pages):
        document.new_page()
    return document.write()
@pytest.fixture
def expected_formatted_analysis_result(n_pages):
    """Expected pipeline output: one result dict per page, each carrying the mock's single bounding box."""
    expected = []
    for page_number in range(n_pages):
        # 42 pixels at the pipeline's default of 200 dpi correspond to 42 / 200 * 72 = 15.12.
        bbox = {"x1": 0.0, "y1": 0.0, "x2": 15.12, "y2": 15.12, "width": 15.12, "height": 15.12}
        page_result = {
            "pageNumber": page_number,
            "rotation": 0,
            "width": 595.0,
            "height": 842.0,
            "bboxes": [bbox],
        }
        expected.append(page_result)
    return expected
@pytest.mark.parametrize("n_pages", [1, 2])
def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result):
    """The pipeline built around the mock analysis function yields the expected dict for every page."""
    pipeline_under_test = make_analysis_pipeline(analysis_fn_mock)
    actual_results = list(pipeline_under_test(empty_pdf))
    assert actual_results == expected_formatted_analysis_result