diff --git a/.gitmodules b/.gitmodules index 8ff9112..6f15bbc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "incl/pyinfra"] path = incl/pyinfra url = ssh://git@git.iqser.com:2222/rr/pyinfra.git +[submodule "incl/pdf2image"] + path = incl/pdf2image + url = ssh://git@git.iqser.com:2222/rr/pdf2image.git diff --git a/Dockerfile b/Dockerfile index 349b83d..1f43274 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,13 @@ RUN python3 -m pip install -r requirements.txt COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt RUN python -m pip install -r incl/pyinfra/requirements.txt +COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt +RUN python -m pip install -r incl/pdf2image/requirements.txt + COPY ./incl ./incl + RUN python3 -m pip install -e incl/pyinfra +RUN python3 -m pip install -e incl/pdf2image COPY ./src ./src COPY ./cv_analysis ./cv_analysis diff --git a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh index 7879f5a..fa49438 100755 --- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh +++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh @@ -12,6 +12,9 @@ echo "dev setup for unit test and coverage" pip install -e incl/pyinfra pip install -r incl/pyinfra/requirements.txt +pip install -e incl/pdf2image +pip install -r incl/pdf2image/requirements.txt + pip install -e . pip install -r requirements.txt diff --git a/bamboo-specs/target/classes/buildjob/PlanSpec.class b/bamboo-specs/target/classes/buildjob/PlanSpec.class deleted file mode 100644 index 5d62edd..0000000 Binary files a/bamboo-specs/target/classes/buildjob/PlanSpec.class and /dev/null differ diff --git a/bamboo-specs/target/classes/scripts/create-licence.sh b/bamboo-specs/target/classes/scripts/create-licence.sh deleted file mode 100644 index a9054cd..0000000 --- a/bamboo-specs/target/classes/scripts/create-licence.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e - -if [[ \"${bamboo_version_tag}\" != \"dev\" ]] -then - ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \ - -f ${bamboo_build_working_directory}/pom.xml \ - versions:set \ - -DnewVersion=${bamboo_version_tag} - - ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \ - -f ${bamboo_build_working_directory}/pom.xml \ - -B clean deploy \ - -e -DdeployAtEnd=true \ - -Dmaven.wagon.http.ssl.insecure=true \ - -Dmaven.wagon.http.ssl.allowall=true \ - -Dmaven.wagon.http.ssl.ignore.validity.dates=true \ - -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases -fi \ No newline at end of file diff --git a/bamboo-specs/target/classes/scripts/docker-build.sh b/bamboo-specs/target/classes/scripts/docker-build.sh deleted file mode 100644 index 42874f6..0000000 --- a/bamboo-specs/target/classes/scripts/docker-build.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e - -SERVICE_NAME=$1 -SERVICE_NAME_BASE=$2 -# TODO version tag on master push -python3 -m venv build_venv -source build_venv/bin/activate -python3 -m pip install --upgrade pip - -pip install dvc -pip install 'dvc[ssh]' -dvc pull - -echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf -docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} . -docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} . -echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001 -docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} diff --git a/bamboo-specs/target/classes/scripts/git-tag.sh b/bamboo-specs/target/classes/scripts/git-tag.sh deleted file mode 100644 index 2005666..0000000 --- a/bamboo-specs/target/classes/scripts/git-tag.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -e - -if [[ "${bamboo_version_tag}" = "dev" ]] -then - echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag -else - echo "gitTag=${bamboo_version_tag}" > git.tag -fi \ No newline at end of file diff --git a/bamboo-specs/target/classes/scripts/sonar-scan.sh b/bamboo-specs/target/classes/scripts/sonar-scan.sh deleted file mode 100644 index fb7a59d..0000000 --- a/bamboo-specs/target/classes/scripts/sonar-scan.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -set -e - -export JAVA_HOME=/usr/bin/sonar-scanner/jre - -python3 -m venv build_venv -source build_venv/bin/activate -python3 -m pip install --upgrade pip - -pip install -e . -pip install -e incl/pyinfra - -pip install -r incl/pyinfra/requirements.txt -pip install -r requirements.txt - -echo "DVC pull step" -dvc pull - -echo "coverage calculation" -coverage run -m pytest test -echo "coverage report generation" -coverage report -m -coverage xml - -SERVICE_NAME=$1 - -echo "dependency-check:aggregate" -mkdir -p reports -dependency-check --enableExperimental -f JSON -f HTML -f XML \ - --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \ - --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**" - -if [[ -z "${bamboo_repository_pr_key}" ]] -then - echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}" - /usr/bin/sonar-scanner/bin/sonar-scanner -X\ - -Dsonar.projectKey=RED_$SERVICE_NAME \ - -Dsonar.sources=src,cv_analysis \ - -Dsonar.host.url=https://sonarqube.iqser.com \ - -Dsonar.login=${bamboo_sonarqube_api_token_secret} \ - -Dsonar.branch.name=${bamboo_planRepository_1_branch} \ - -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \ - -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \ - -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \ - -Dsonar.python.coverage.reportPaths=reports/coverage.xml - -else - echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}" - /usr/bin/sonar-scanner/bin/sonar-scanner \ - -Dsonar.projectKey=RED_$SERVICE_NAME \ - -Dsonar.sources=src,cv_analysis \ - -Dsonar.host.url=https://sonarqube.iqser.com \ - -Dsonar.login=${bamboo_sonarqube_api_token_secret} \ - -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \ - -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \ - -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \ - -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \ - -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \ - -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \ - -Dsonar.python.coverage.reportPaths=reports/coverage.xml -fi diff --git a/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class b/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class deleted file mode 100644 index 1bc1310..0000000 Binary files a/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class and /dev/null differ diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py index 44c346e..16053f6 100644 --- a/cv_analysis/server/pipeline.py +++ b/cv_analysis/server/pipeline.py @@ -1,44 +1,42 @@ from functools import partial -from typing import Callable +from itertools import starmap +from operator import truth +from typing import Callable, Iterator from funcy import lmap from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline from cv_analysis.layout_parsing import parse_layout -from cv_analysis.server.rotate import rotate_rectangle from cv_analysis.table_parsing import parse_tables -from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs from cv_analysis.utils.structures import Rectangle +from pdf2img.conversion import convert_pdf_to_image_and_metadata_stream def make_analysis_pipeline(analysis_fn: Callable, dpi=200): """Make end-to-end pipeline to analyse a PDF with given analysis function. - The pipeline returns a Generator of dicts containing page information and the analysis results. - + The pipeline streams dicts containing page information and the analysis results. + Note: + If there are no results on a page, the page is skipped in result stream Steps: - Convert PDF to pairs of image and page information - Analyse pages, get list of bounding boxes per page (e.g. table cells) - Convert pixel values to inches - Rotate results if page is rotated - Format results to stream of dictionaries with page information and analysis results + Convert PDF to a stream of page as image and metadata (page information) tuples + Analyse pages: + Get list of bounding boxes per page (e.g. table cells) + Convert pixel values to inches + Format results """ - def pipeline(pdf: bytes, index=None): - image_metadata_pairs = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi) - results = map(image_metadata_pair_to_results, image_metadata_pairs) - results_filtered = filter(lambda x: x["bboxes"], results) - return results_filtered + def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]: + image_metadata_stream = convert_pdf_to_image_and_metadata_stream(pdf, index=index, dpi=dpi) + results = starmap(analyse_image_metadata_pair, image_metadata_stream) + yield from filter(truth, results) - def image_metadata_pair_to_results(image_metadata_pair): - rectangles = analysis_fn(image_metadata_pair.image) - rectangles = map(partial(pixel_rect_to_inches_rect, dpi=dpi), rectangles) - if image_metadata_pair.metadata["rotation"] != 0: - rotate_rectangle_fn = partial(rotate_rectangle, metadata=image_metadata_pair.metadata) - rectangles = map(rotate_rectangle_fn, rectangles) - bboxes = lmap(lambda x: x.json_xyxy(), rectangles) - return {**image_metadata_pair.metadata, "bboxes": bboxes} + def analyse_image_metadata_pair(image, metadata): + rectangles = analysis_fn(image) + rectangles = map(partial(convert_pixel_rect_to_inches_rect, dpi=dpi), rectangles) + bboxes = lmap(lambda x: x.json_full(), rectangles) + return {**metadata, "bboxes": bboxes} if bboxes else {} - return pipeline + return analysis_pipeline def get_analysis_fn(analysis_type): @@ -52,10 +50,9 @@ def get_analysis_fn(analysis_type): raise -def pixel_rect_to_inches_rect(rect, dpi): - def convert_pixel_to_inch(pixel): +def convert_pixel_rect_to_inches_rect(rect, dpi): + def pixel_to_inch(pixel): return pixel / dpi * 72 - bbox = rect.x1, rect.y1, rect.x2, rect.y2 - bbox_inches = tuple(map(convert_pixel_to_inch, bbox)) + bbox_inches = tuple(map(pixel_to_inch, rect.xyxy())) return Rectangle.from_xyxy(bbox_inches, discrete=False) diff --git a/cv_analysis/server/rotate.py b/cv_analysis/server/rotate.py deleted file mode 100644 index ec9a867..0000000 --- a/cv_analysis/server/rotate.py +++ /dev/null @@ -1,107 +0,0 @@ -from _operator import itemgetter - -import numpy as np - -from cv_analysis.utils.structures import Rectangle - - -def rotate_rectangle(rectangle, metadata): - width, height, rotation = itemgetter("width", "height", "rotation")(metadata) - rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation - - if rotation in [1, 3]: - width, height = height, width - - x1, y1, x2, y2 = rectangle.xyxy() - matrix = np.vstack([[x1, y1], [x2, y2]]).T - new_matrix = rotate_and_shift(matrix, rotation, (width, height)) - - x1, x2 = sorted(new_matrix[0, :]) - y1, y2 = sorted(new_matrix[1, :]) - - return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False) - - -def rotate_and_shift(matrix, rotation, size, debug=False): - """Rotates a matrix against (!) a specified rotation. That is, the rotation is applied negatively. The matrix is - also shifted to ensure it contains points (columns) in quadrant I. - - Procedure: - 1) Rotate the matrix clockwise according to rotation value - 2) Shift the matrix back into quadrant I - 3) Set x_i and y_i to new lower left and upper right corners, since the corner vectors are no longer at these - corners due to the rotation - - Args: - matrix: matrix to transform - rotation: any of 0, 1, 2, or 3, where 1 = 90 degree CLOCKWISE rotation etc. - size: the size of the page as a tuple (, ) - debug: Visualizes the transformations for later re-understanding of the code - """ - - def shift_to_quadrant_1(matrix): - - # TODO: generalize - if rotation == 0: - back_shift = np.zeros_like(np.eye(2)) - elif rotation == 1: - back_shift = np.array([[0, 0], [1, 1]]) * size[1] - elif rotation == 2: - back_shift = np.array([[1, 1], [1, 1]]) * size - elif rotation == 3: - back_shift = np.array([[1, 1], [0, 0]]) * size[0] - else: - raise ValueError(f"Unexpected rotation value '{rotation}'. Expected any of 0, 1, 2, or 3.") - - matrix_shifted = matrix + back_shift - return matrix_shifted - - # PDF rotations are clockwise, hence subtract the radian value of the rotation from 2 pi - radians = (2 * np.pi) - (np.pi * (rotation / 2)) - matrix_rotated = rotate(matrix, radians) - matrix_rotated_and_shifted = shift_to_quadrant_1(matrix_rotated) - - if debug: - __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted) - return matrix_rotated_and_shifted - - -def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted): - - import matplotlib.pyplot as plt - from copy import deepcopy - - m1 = matrix - m2 = matrix_rotated - m3 = matrix_rotated_and_shifted - - m1, m2, m3 = map(deepcopy, (m1, m2, m3)) - - frame = np.eye(2) * size - frame_rotated = rotate(frame, radians) - - f1 = frame - f2 = frame_rotated - - f1 *= 0.005 * 1 - f2 *= 0.005 * 1 - m1 *= 0.005 * 1 - m2 *= 0.005 * 1 - m3 *= 0.005 * 1 - - fig, axes = plt.subplots(1, 2, figsize=(8, 4)) - axes = axes.ravel() - - axes[0].quiver([0, 0], [0, 0], f1[0, :], f1[1, :], scale=5, scale_units="inches", color="red") - axes[1].quiver([0, 0], [0, 0], f2[0, :], f2[1, :], scale=5, scale_units="inches", color="red") - axes[0].quiver([0, 0], [0, 0], m1[0, :], m1[1, :], scale=5, scale_units="inches") - axes[1].quiver([0, 0], [0, 0], m2[0, :], m2[1, :], scale=5, scale_units="inches", color="green") - axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue") - - plt.show() - - -def rotate(input_matrix, radians): - rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]]) - - return np.dot(rotation_matrix, input_matrix) diff --git a/cv_analysis/utils/pdf2image.py b/cv_analysis/utils/pdf2image.py deleted file mode 100644 index a26b003..0000000 --- a/cv_analysis/utils/pdf2image.py +++ /dev/null @@ -1,46 +0,0 @@ -from dataclasses import dataclass -from functools import partial -from typing import Iterator - -import fitz -import numpy as np - - -@dataclass -class ImageMetadataPair: - image: np.ndarray - metadata: dict - - -def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]: - """Streams PDF as pairs of image (matrix) and metadata. - Note: If Index is not given or evaluates to None, the whole PDF will be processed. - """ - convert_fn = partial(page_to_image_metadata_pair, dpi=dpi) - yield from map(convert_fn, stream_pages(pdf, index)) - - -def page_to_image_metadata_pair(page: fitz.Page, dpi): - metadata = get_page_info(page) - pixmap = page.get_pixmap(dpi=dpi) - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - - return ImageMetadataPair(array, metadata) - - -def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: - with fitz.open(stream=pdf) as pdf_handle: - if not index: - yield from pdf_handle - else: - for i in index: - yield pdf_handle[i] - - -def get_page_info(page): - return { - "index": page.number, - "rotation": page.rotation, - "width": page.rect.width, # rotated page width in inches - "height": page.rect.height, # rotated page height in inches - } diff --git a/incl/pdf2image b/incl/pdf2image new file mode 160000 index 0000000..d1a68b9 --- /dev/null +++ b/incl/pdf2image @@ -0,0 +1 @@ +Subproject commit d1a68b9e580ecbc0cd3050deeedc2d648b377232 diff --git a/incl/pyinfra b/incl/pyinfra index 6c26528..0f6512d 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9 +Subproject commit 0f6512df5423df98d334f5735170cd1f7642998a diff --git a/test/unit_tests/pdf2image_test.py b/test/unit_tests/pdf2image_test.py deleted file mode 100644 index 4a44a26..0000000 --- a/test/unit_tests/pdf2image_test.py +++ /dev/null @@ -1,24 +0,0 @@ -import fitz -import numpy as np -import pytest - -from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs - - -@pytest.fixture -def pdf(n_pages): - doc = fitz.open() - for n in range(n_pages): - page = doc.new_page() - where = fitz.Point(50, 100) - page.insert_text(where, "De gustibus non est disputandum.", fontsize=30) - return doc.write() - - -@pytest.mark.parametrize("n_pages", [1]) -def test_pdf_to_array_and_metadata(pdf): - for image_metadata_pair in pdf_to_image_metadata_pairs(pdf): - assert isinstance(image_metadata_pair.image, np.ndarray) - assert image_metadata_pair.image.shape == (2339, 1653, 3) # Height, Width, Color channels - - assert isinstance(image_metadata_pair.metadata, dict) diff --git a/test/unit_tests/server_pipeline_test.py b/test/unit_tests/server_pipeline_test.py new file mode 100644 index 0000000..dac8527 --- /dev/null +++ b/test/unit_tests/server_pipeline_test.py @@ -0,0 +1,40 @@ +import fitz +import numpy as np +import pytest + +from cv_analysis.server.pipeline import make_analysis_pipeline +from cv_analysis.utils.structures import Rectangle + + +def analysis_fn_mock(image: np.ndarray): + bbox = (0, 0, 42, 42) + return [Rectangle.from_xyxy(bbox)] + + +@pytest.fixture +def empty_pdf(n_pages): + doc = fitz.open() + for n in range(n_pages): + doc.new_page() + return doc.write() + + +@pytest.fixture +def expected_formatted_analysis_result(n_pages): + return [ + { + "pageNumber": page_number, + "rotation": 0, + "width": 595.0, + "height": 842.0, + "bboxes": [{"x1": 0.0, "y1": 0.0, "x2": 15.12, "y2": 15.12, "width": 15.12, "height": 15.12}], + } + for page_number in range(n_pages) + ] + + +@pytest.mark.parametrize("n_pages", [1, 2]) +def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result): + analysis_pipeline = make_analysis_pipeline(analysis_fn_mock) + results = analysis_pipeline(empty_pdf) + assert list(results) == expected_formatted_analysis_result