Pull request #23: Add pdf2image module

Merge in RR/cv-analysis from add-pdf2image-module to master Squashed commit of the following: commit 13355e2dd006fae9ee05c2d00acbbc8b38fd1e8e Merge: eaf4627 edbda58 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 13:35:27 2022 +0200 Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into add-pdf2image-module commit eaf462768787642889d496203034d017c4ec959b Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 13:26:58 2022 +0200 update build scripts commit d429c713f4e5e74afca81c2354e8125bf389b865 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 13:11:07 2022 +0200 purge target commit 349b81c5db724bf70d6f31b58ded2b5414216bfe Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 13:07:58 2022 +0200 Revert "extinguish target" This reverts commit d2bd4cefde0648d2487839b0344509b984435273. commit d2bd4cefde0648d2487839b0344509b984435273 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 12:57:50 2022 +0200 extinguish target commit 5f6cc713db31e3e16c8e7f13a59804c86b5d77d7 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 11:58:52 2022 +0200 refactor commit 576019378a39b580b816d9eb7957774f1faf48b9 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 11:52:04 2022 +0200 add test for adjusted server analysis pipeline logic commit bdf0121929d6941cbba565055f37df7970925c79 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 11:30:17 2022 +0200 update analysis pipeline logic to use imported pdf2image commit f7cef98d5e6d7b95517bbd047dd3e958acebb3d8 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Aug 2 11:04:34 2022 +0200 add pdf2image as git submodule
2022-08-02 13:36:50 +02:00 · 2022-08-02 13:36:50 +02:00 · 016abe46de
commit 016abe46de
parent edbda58837
16 changed files with 78 additions and 314 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "incl/pyinfra"]
 	path = incl/pyinfra
 	url = ssh://git@git.iqser.com:2222/rr/pyinfra.git
+[submodule "incl/pdf2image"]
+	path = incl/pdf2image
+	url = ssh://git@git.iqser.com:2222/rr/pdf2image.git
--- a/5
+++ b/5
@@ -13,8 +13,13 @@ RUN python3 -m pip install -r requirements.txt
 COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
 RUN python -m pip install -r incl/pyinfra/requirements.txt

+COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt
+RUN python -m pip install -r incl/pdf2image/requirements.txt
+
 COPY ./incl ./incl
+
 RUN python3 -m pip install -e incl/pyinfra
+RUN python3 -m pip install -e incl/pdf2image

 COPY ./src ./src
 COPY ./cv_analysis ./cv_analysis
--- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
+++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
@@ -12,6 +12,9 @@ echo "dev setup for unit test and coverage"
 pip install -e incl/pyinfra
 pip install -r incl/pyinfra/requirements.txt

+pip install -e incl/pdf2image
+pip install -r incl/pdf2image/requirements.txt
+
 pip install -e .
 pip install -r requirements.txt

--- a/bamboo-specs/target/classes/buildjob/PlanSpec.class
+++ b/bamboo-specs/target/classes/buildjob/PlanSpec.class
--- a/bamboo-specs/target/classes/scripts/create-licence.sh
+++ b/bamboo-specs/target/classes/scripts/create-licence.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
-then
-    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-                    -f ${bamboo_build_working_directory}/pom.xml \
-                    versions:set  \
-                    -DnewVersion=${bamboo_version_tag}
-
-    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-                    -f ${bamboo_build_working_directory}/pom.xml \
-                    -B clean deploy \
-                    -e -DdeployAtEnd=true \
-                    -Dmaven.wagon.http.ssl.insecure=true \
-                    -Dmaven.wagon.http.ssl.allowall=true \
-                    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
-                    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
-fi
--- a/bamboo-specs/target/classes/scripts/docker-build.sh
+++ b/bamboo-specs/target/classes/scripts/docker-build.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-set -e
-
-SERVICE_NAME=$1
-SERVICE_NAME_BASE=$2
-# TODO version tag on master push
-python3 -m venv build_venv
-source build_venv/bin/activate
-python3 -m pip install --upgrade pip
-
-pip install dvc
-pip install 'dvc[ssh]'
-dvc pull
-
-echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
-docker build -f Dockerfile_base  -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
-docker build -f Dockerfile  -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
-echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
-docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
--- a/bamboo-specs/target/classes/scripts/git-tag.sh
+++ b/bamboo-specs/target/classes/scripts/git-tag.sh
@@ -1,9 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ "${bamboo_version_tag}" = "dev" ]]
-then
-    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
-else
-    echo "gitTag=${bamboo_version_tag}" > git.tag
-fi
--- a/bamboo-specs/target/classes/scripts/sonar-scan.sh
+++ b/bamboo-specs/target/classes/scripts/sonar-scan.sh
@@ -1,61 +0,0 @@
-#!/bin/bash
-set -e
-
-export JAVA_HOME=/usr/bin/sonar-scanner/jre
-
-python3 -m venv build_venv
-source build_venv/bin/activate
-python3 -m pip install --upgrade pip
-
-pip install -e .
-pip install -e incl/pyinfra
-
-pip install -r incl/pyinfra/requirements.txt
-pip install -r requirements.txt
-
-echo "DVC pull step"
-dvc pull
-
-echo "coverage calculation"
-coverage run -m pytest test
-echo "coverage report generation"
-coverage report -m
-coverage xml
-
-SERVICE_NAME=$1
-
-echo "dependency-check:aggregate"
-mkdir -p reports
-dependency-check --enableExperimental -f JSON -f HTML -f XML \
-  --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
-  --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
-
-if [[ -z "${bamboo_repository_pr_key}" ]]
-then
-    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
-    /usr/bin/sonar-scanner/bin/sonar-scanner -X\
-      -Dsonar.projectKey=RED_$SERVICE_NAME \
-      -Dsonar.sources=src,cv_analysis \
-      -Dsonar.host.url=https://sonarqube.iqser.com \
-      -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-      -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-      -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-      -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-      -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-      -Dsonar.python.coverage.reportPaths=reports/coverage.xml
-
-else
-    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
-    /usr/bin/sonar-scanner/bin/sonar-scanner \
-      -Dsonar.projectKey=RED_$SERVICE_NAME \
-      -Dsonar.sources=src,cv_analysis \
-      -Dsonar.host.url=https://sonarqube.iqser.com \
-      -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-      -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-      -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-      -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-      -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-      -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-      -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-      -Dsonar.python.coverage.reportPaths=reports/coverage.xml
-fi
--- a/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class
+++ b/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class
--- a/cv_analysis/server/pipeline.py
+++ b/cv_analysis/server/pipeline.py
@@ -1,44 +1,42 @@
 from functools import partial
-from typing import Callable
+from itertools import starmap
+from operator import truth
+from typing import Callable, Iterator

 from funcy import lmap

 from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
 from cv_analysis.layout_parsing import parse_layout
-from cv_analysis.server.rotate import rotate_rectangle
 from cv_analysis.table_parsing import parse_tables
-from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
 from cv_analysis.utils.structures import Rectangle
+from pdf2img.conversion import convert_pdf_to_image_and_metadata_stream


 def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
    """Make end-to-end pipeline to analyse a PDF with given analysis function.
-    The pipeline returns a Generator of dicts containing page information and the analysis results.
-
+    The pipeline streams dicts containing page information and the analysis results.
+    Note:
+        If there are no results on a page, the page is skipped in result stream
    Steps:
-        Convert PDF to pairs of image and page information
-        Analyse pages, get list of bounding boxes per page (e.g. table cells)
+        Convert PDF to a stream of page as image and metadata (page information) tuples
+        Analyse pages:
+            Get list of bounding boxes per page (e.g. table cells)
            Convert pixel values to inches
-        Rotate results if page is rotated
-        Format results to stream of dictionaries with page information and analysis results
+            Format results
    """

-    def pipeline(pdf: bytes, index=None):
-        image_metadata_pairs = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi)
-        results = map(image_metadata_pair_to_results, image_metadata_pairs)
-        results_filtered = filter(lambda x: x["bboxes"], results)
-        return results_filtered
+    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
+        image_metadata_stream = convert_pdf_to_image_and_metadata_stream(pdf, index=index, dpi=dpi)
+        results = starmap(analyse_image_metadata_pair, image_metadata_stream)
+        yield from filter(truth, results)

-    def image_metadata_pair_to_results(image_metadata_pair):
-        rectangles = analysis_fn(image_metadata_pair.image)
-        rectangles = map(partial(pixel_rect_to_inches_rect, dpi=dpi), rectangles)
-        if image_metadata_pair.metadata["rotation"] != 0:
-            rotate_rectangle_fn = partial(rotate_rectangle, metadata=image_metadata_pair.metadata)
-            rectangles = map(rotate_rectangle_fn, rectangles)
-        bboxes = lmap(lambda x: x.json_xyxy(), rectangles)
-        return {**image_metadata_pair.metadata, "bboxes": bboxes}
+    def analyse_image_metadata_pair(image, metadata):
+        rectangles = analysis_fn(image)
+        rectangles = map(partial(convert_pixel_rect_to_inches_rect, dpi=dpi), rectangles)
+        bboxes = lmap(lambda x: x.json_full(), rectangles)
+        return {**metadata, "bboxes": bboxes} if bboxes else {}

-    return pipeline
+    return analysis_pipeline


 def get_analysis_fn(analysis_type):
@@ -52,10 +50,9 @@ def get_analysis_fn(analysis_type):
        raise


-def pixel_rect_to_inches_rect(rect, dpi):
-    def convert_pixel_to_inch(pixel):
+def convert_pixel_rect_to_inches_rect(rect, dpi):
+    def pixel_to_inch(pixel):
        return pixel / dpi * 72

-    bbox = rect.x1, rect.y1, rect.x2, rect.y2
-    bbox_inches = tuple(map(convert_pixel_to_inch, bbox))
+    bbox_inches = tuple(map(pixel_to_inch, rect.xyxy()))
    return Rectangle.from_xyxy(bbox_inches, discrete=False)
--- a/cv_analysis/server/rotate.py
+++ b/cv_analysis/server/rotate.py
@@ -1,107 +0,0 @@
-from _operator import itemgetter
-
-import numpy as np
-
-from cv_analysis.utils.structures import Rectangle
-
-
-def rotate_rectangle(rectangle, metadata):
-    width, height, rotation = itemgetter("width", "height", "rotation")(metadata)
-    rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation
-
-    if rotation in [1, 3]:
-        width, height = height, width
-
-    x1, y1, x2, y2 = rectangle.xyxy()
-    matrix = np.vstack([[x1, y1], [x2, y2]]).T
-    new_matrix = rotate_and_shift(matrix, rotation, (width, height))
-
-    x1, x2 = sorted(new_matrix[0, :])
-    y1, y2 = sorted(new_matrix[1, :])
-
-    return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False)
-
-
-def rotate_and_shift(matrix, rotation, size, debug=False):
-    """Rotates a matrix against (!) a specified rotation. That is, the rotation is applied negatively. The matrix is
-    also shifted to ensure it contains points (columns) in quadrant I.
-
-    Procedure:
-        1) Rotate the matrix clockwise according to rotation value
-        2) Shift the matrix back into quadrant I
-        3) Set x_i and y_i to new lower left and upper right corners, since the corner vectors are no longer at these
-            corners due to the rotation
-
-    Args:
-        matrix: matrix to transform
-        rotation: any of  0, 1, 2, or 3, where 1 = 90 degree CLOCKWISE rotation etc.
-        size: the size of the page as a tuple (<width>, <height>)
-        debug: Visualizes the transformations for later re-understanding of the code
-    """
-
-    def shift_to_quadrant_1(matrix):
-
-        # TODO: generalize
-        if rotation == 0:
-            back_shift = np.zeros_like(np.eye(2))
-        elif rotation == 1:
-            back_shift = np.array([[0, 0], [1, 1]]) * size[1]
-        elif rotation == 2:
-            back_shift = np.array([[1, 1], [1, 1]]) * size
-        elif rotation == 3:
-            back_shift = np.array([[1, 1], [0, 0]]) * size[0]
-        else:
-            raise ValueError(f"Unexpected rotation value '{rotation}'. Expected any of 0, 1, 2, or 3.")
-
-        matrix_shifted = matrix + back_shift
-        return matrix_shifted
-
-    # PDF rotations are clockwise, hence subtract the radian value of the rotation from 2 pi
-    radians = (2 * np.pi) - (np.pi * (rotation / 2))
-    matrix_rotated = rotate(matrix, radians)
-    matrix_rotated_and_shifted = shift_to_quadrant_1(matrix_rotated)
-
-    if debug:
-        __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted)
-    return matrix_rotated_and_shifted
-
-
-def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted):
-
-    import matplotlib.pyplot as plt
-    from copy import deepcopy
-
-    m1 = matrix
-    m2 = matrix_rotated
-    m3 = matrix_rotated_and_shifted
-
-    m1, m2, m3 = map(deepcopy, (m1, m2, m3))
-
-    frame = np.eye(2) * size
-    frame_rotated = rotate(frame, radians)
-
-    f1 = frame
-    f2 = frame_rotated
-
-    f1 *= 0.005 * 1
-    f2 *= 0.005 * 1
-    m1 *= 0.005 * 1
-    m2 *= 0.005 * 1
-    m3 *= 0.005 * 1
-
-    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
-    axes = axes.ravel()
-
-    axes[0].quiver([0, 0], [0, 0], f1[0, :], f1[1, :], scale=5, scale_units="inches", color="red")
-    axes[1].quiver([0, 0], [0, 0], f2[0, :], f2[1, :], scale=5, scale_units="inches", color="red")
-    axes[0].quiver([0, 0], [0, 0], m1[0, :], m1[1, :], scale=5, scale_units="inches")
-    axes[1].quiver([0, 0], [0, 0], m2[0, :], m2[1, :], scale=5, scale_units="inches", color="green")
-    axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue")
-
-    plt.show()
-
-
-def rotate(input_matrix, radians):
-    rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]])
-
-    return np.dot(rotation_matrix, input_matrix)
--- a/cv_analysis/utils/pdf2image.py
+++ b/cv_analysis/utils/pdf2image.py
@@ -1,46 +0,0 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import Iterator
-
-import fitz
-import numpy as np
-
-
-@dataclass
-class ImageMetadataPair:
-    image: np.ndarray
-    metadata: dict
-
-
-def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
-    """Streams PDF as pairs of image (matrix) and metadata.
-    Note: If Index is not given or evaluates to None, the whole PDF will be processed.
-    """
-    convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
-    yield from map(convert_fn, stream_pages(pdf, index))
-
-
-def page_to_image_metadata_pair(page: fitz.Page, dpi):
-    metadata = get_page_info(page)
-    pixmap = page.get_pixmap(dpi=dpi)
-    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
-
-    return ImageMetadataPair(array, metadata)
-
-
-def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
-    with fitz.open(stream=pdf) as pdf_handle:
-        if not index:
-            yield from pdf_handle
-        else:
-            for i in index:
-                yield pdf_handle[i]
-
-
-def get_page_info(page):
-    return {
-        "index": page.number,
-        "rotation": page.rotation,
-        "width": page.rect.width,  # rotated page width in inches
-        "height": page.rect.height,  # rotated page height in inches
-    }
--- a/incl/pdf2image
+++ b/incl/pdf2image
@@ -0,0 +1 @@
+Subproject commit d1a68b9e580ecbc0cd3050deeedc2d648b377232
--- a/incl/pyinfra
+++ b/incl/pyinfra
@@ -1 +1 @@
-Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9
+Subproject commit 0f6512df5423df98d334f5735170cd1f7642998a
--- a/test/unit_tests/pdf2image_test.py
+++ b/test/unit_tests/pdf2image_test.py
@@ -1,24 +0,0 @@
-import fitz
-import numpy as np
-import pytest
-
-from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
-
-
-@pytest.fixture
-def pdf(n_pages):
-    doc = fitz.open()
-    for n in range(n_pages):
-        page = doc.new_page()
-        where = fitz.Point(50, 100)
-        page.insert_text(where, "De gustibus non est disputandum.", fontsize=30)
-    return doc.write()
-
-
-@pytest.mark.parametrize("n_pages", [1])
-def test_pdf_to_array_and_metadata(pdf):
-    for image_metadata_pair in pdf_to_image_metadata_pairs(pdf):
-        assert isinstance(image_metadata_pair.image, np.ndarray)
-        assert image_metadata_pair.image.shape == (2339, 1653, 3)  # Height, Width, Color channels
-
-        assert isinstance(image_metadata_pair.metadata, dict)
--- a/test/unit_tests/server_pipeline_test.py
+++ b/test/unit_tests/server_pipeline_test.py
@@ -0,0 +1,40 @@
+import fitz
+import numpy as np
+import pytest
+
+from cv_analysis.server.pipeline import make_analysis_pipeline
+from cv_analysis.utils.structures import Rectangle
+
+
+def analysis_fn_mock(image: np.ndarray):
+    bbox = (0, 0, 42, 42)
+    return [Rectangle.from_xyxy(bbox)]
+
+
+@pytest.fixture
+def empty_pdf(n_pages):
+    doc = fitz.open()
+    for n in range(n_pages):
+        doc.new_page()
+    return doc.write()
+
+
+@pytest.fixture
+def expected_formatted_analysis_result(n_pages):
+    return [
+        {
+            "pageNumber": page_number,
+            "rotation": 0,
+            "width": 595.0,
+            "height": 842.0,
+            "bboxes": [{"x1": 0.0, "y1": 0.0, "x2": 15.12, "y2": 15.12, "width": 15.12, "height": 15.12}],
+        }
+        for page_number in range(n_pages)
+    ]
+
+
+@pytest.mark.parametrize("n_pages", [1, 2])
+def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result):
+    analysis_pipeline = make_analysis_pipeline(analysis_fn_mock)
+    results = analysis_pipeline(empty_pdf)
+    assert list(results) == expected_formatted_analysis_result
				`@@ -0,0 +1 @@`
				`Subproject commit d1a68b9e580ecbc0cd3050deeedc2d648b377232`