From 016abe46de6406038b1b8cdf31401776484ffca3 Mon Sep 17 00:00:00 2001
From: Julius Unverfehrt
Date: Tue, 2 Aug 2022 13:36:50 +0200
Subject: [PATCH] Pull request #23: Add pdf2image module

Merge in RR/cv-analysis from add-pdf2image-module to master

Squashed commit of the following:

commit 13355e2dd006fae9ee05c2d00acbbc8b38fd1e8e
Merge: eaf4627 edbda58
Author: Julius Unverfehrt
Date:   Tue Aug 2 13:35:27 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into add-pdf2image-module

commit eaf462768787642889d496203034d017c4ec959b
Author: Julius Unverfehrt
Date:   Tue Aug 2 13:26:58 2022 +0200

    update build scripts

commit d429c713f4e5e74afca81c2354e8125bf389b865
Author: Julius Unverfehrt
Date:   Tue Aug 2 13:11:07 2022 +0200

    purge target

commit 349b81c5db724bf70d6f31b58ded2b5414216bfe
Author: Julius Unverfehrt
Date:   Tue Aug 2 13:07:58 2022 +0200

    Revert "extinguish target"

    This reverts commit d2bd4cefde0648d2487839b0344509b984435273.

commit d2bd4cefde0648d2487839b0344509b984435273
Author: Julius Unverfehrt
Date:   Tue Aug 2 12:57:50 2022 +0200

    extinguish target

commit 5f6cc713db31e3e16c8e7f13a59804c86b5d77d7
Author: Julius Unverfehrt
Date:   Tue Aug 2 11:58:52 2022 +0200

    refactor

commit 576019378a39b580b816d9eb7957774f1faf48b9
Author: Julius Unverfehrt
Date:   Tue Aug 2 11:52:04 2022 +0200

    add test for adjusted server analysis pipeline logic

commit bdf0121929d6941cbba565055f37df7970925c79
Author: Julius Unverfehrt
Date:   Tue Aug 2 11:30:17 2022 +0200

    update analysis pipeline logic to use imported pdf2image

commit f7cef98d5e6d7b95517bbd047dd3e958acebb3d8
Author: Julius Unverfehrt
Date:   Tue Aug 2 11:04:34 2022 +0200

    add pdf2image as git submodule
---
 .gitmodules                                   |   3 +
 Dockerfile                                    |   5 +
 .../src/main/resources/scripts/sonar-scan.sh  |   3 +
 .../target/classes/buildjob/PlanSpec.class    | Bin 11496 -> 0 bytes
 .../target/classes/scripts/create-licence.sh  |  19 ----
 .../target/classes/scripts/docker-build.sh    |  19 ----
 .../target/classes/scripts/git-tag.sh         |   9 --
 .../target/classes/scripts/sonar-scan.sh      |  61 ----------
 .../test-classes/buildjob/PlanSpecTest.class  | Bin 940 -> 0 bytes
 cv_analysis/server/pipeline.py                |  53 ++++-----
 cv_analysis/server/rotate.py                  | 107 ------------------
 cv_analysis/utils/pdf2image.py                |  46 --------
 incl/pdf2image                                |   1 +
 incl/pyinfra                                  |   2 +-
 test/unit_tests/pdf2image_test.py             |  24 ----
 test/unit_tests/server_pipeline_test.py       |  40 +++++++
 16 files changed, 78 insertions(+), 314 deletions(-)
 delete mode 100644 bamboo-specs/target/classes/buildjob/PlanSpec.class
 delete mode 100644 bamboo-specs/target/classes/scripts/create-licence.sh
 delete mode 100644 bamboo-specs/target/classes/scripts/docker-build.sh
 delete mode 100644 bamboo-specs/target/classes/scripts/git-tag.sh
 delete mode 100644 bamboo-specs/target/classes/scripts/sonar-scan.sh
 delete mode 100644 bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class
 delete mode 100644 cv_analysis/server/rotate.py
 delete mode 100644 cv_analysis/utils/pdf2image.py
 create mode 160000 incl/pdf2image
 delete mode 100644 test/unit_tests/pdf2image_test.py
 create mode 100644 test/unit_tests/server_pipeline_test.py

diff --git a/.gitmodules b/.gitmodules
index 8ff9112..6f15bbc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "incl/pyinfra"]
     path = incl/pyinfra
    url = ssh://git@git.iqser.com:2222/rr/pyinfra.git
+[submodule "incl/pdf2image"]
+    path = incl/pdf2image
+   url = ssh://git@git.iqser.com:2222/rr/pdf2image.git
diff --git a/Dockerfile b/Dockerfile
index 349b83d..1f43274 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,8 +13,13 @@ RUN python3 -m pip install -r requirements.txt
 COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
 RUN python -m pip install -r incl/pyinfra/requirements.txt
 
+COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt
+RUN python -m pip install -r incl/pdf2image/requirements.txt
+
 COPY ./incl ./incl
+
 RUN python3 -m pip install -e incl/pyinfra
+RUN python3 -m pip install -e incl/pdf2image
 
 COPY ./src ./src
 COPY ./cv_analysis ./cv_analysis
diff --git a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
index 7879f5a..fa49438 100755
--- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
+++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
@@ -12,6 +12,9 @@ echo "dev setup for unit test and coverage"
 pip install -e incl/pyinfra
 pip install -r incl/pyinfra/requirements.txt
 
+pip install -e incl/pdf2image
+pip install -r incl/pdf2image/requirements.txt
+
 pip install -e .
 pip install -r requirements.txt
 
diff --git a/bamboo-specs/target/classes/buildjob/PlanSpec.class b/bamboo-specs/target/classes/buildjob/PlanSpec.class
deleted file mode 100644
index 5d62edd316e2bcf4b1b357ff21a5462c31361775..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11496
-docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
-docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
-echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
-docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
diff --git a/bamboo-specs/target/classes/scripts/git-tag.sh b/bamboo-specs/target/classes/scripts/git-tag.sh
deleted file mode 100644
index 2005666..0000000
--- a/bamboo-specs/target/classes/scripts/git-tag.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-set -e
-
-if [[ "${bamboo_version_tag}" = "dev" ]]
-then
-  echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
-else
-  echo "gitTag=${bamboo_version_tag}" > git.tag
-fi
\ No newline at end of file
diff --git a/bamboo-specs/target/classes/scripts/sonar-scan.sh b/bamboo-specs/target/classes/scripts/sonar-scan.sh
deleted file mode 100644
index fb7a59d..0000000
--- a/bamboo-specs/target/classes/scripts/sonar-scan.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-set -e
-
-export JAVA_HOME=/usr/bin/sonar-scanner/jre
-
-python3 -m venv build_venv
-source build_venv/bin/activate
-python3 -m pip install --upgrade pip
-
-pip install -e .
-pip install -e incl/pyinfra
-
-pip install -r incl/pyinfra/requirements.txt
-pip install -r requirements.txt
-
-echo "DVC pull step"
-dvc pull
-
-echo "coverage calculation"
-coverage run -m pytest test
-echo "coverage report generation"
-coverage report -m
-coverage xml
-
-SERVICE_NAME=$1
-
-echo "dependency-check:aggregate"
-mkdir -p reports
-dependency-check --enableExperimental -f JSON -f HTML -f XML \
-    --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
-    --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
-
-if [[ -z "${bamboo_repository_pr_key}" ]]
-then
-  echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
-  /usr/bin/sonar-scanner/bin/sonar-scanner -X\
-    -Dsonar.projectKey=RED_$SERVICE_NAME \
-    -Dsonar.sources=src,cv_analysis \
-    -Dsonar.host.url=https://sonarqube.iqser.com \
-    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-    -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-    -Dsonar.python.coverage.reportPaths=reports/coverage.xml
-
-else
-  echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
-  /usr/bin/sonar-scanner/bin/sonar-scanner \
-    -Dsonar.projectKey=RED_$SERVICE_NAME \
-    -Dsonar.sources=src,cv_analysis \
-    -Dsonar.host.url=https://sonarqube.iqser.com \
-    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-    -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-    -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-    -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-    -Dsonar.python.coverage.reportPaths=reports/coverage.xml
-fi
diff --git a/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class b/bamboo-specs/target/test-classes/buildjob/PlanSpecTest.class
deleted file mode 100644
index 1bc1310875c92a2f1aa7d0cc42ec2920bc1aa74e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 940
diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py
index 44c346e..16053f6 100644
--- a/cv_analysis/server/pipeline.py
+++ b/cv_analysis/server/pipeline.py
@@ -1,44 +1,42 @@
 from functools import partial
-from typing import Callable
+from itertools import starmap
+from operator import truth
+from typing import Callable, Iterator
 
 from funcy import lmap
 
 from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
 from cv_analysis.layout_parsing import parse_layout
-from cv_analysis.server.rotate import rotate_rectangle
 from cv_analysis.table_parsing import parse_tables
-from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
 from cv_analysis.utils.structures import Rectangle
+from pdf2img.conversion import convert_pdf_to_image_and_metadata_stream
 
 
 def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
     """Make end-to-end pipeline to analyse a PDF with given analysis function.
 
-    The pipeline returns a Generator of dicts containing page information and the analysis results.
-
+    The pipeline streams dicts containing page information and the analysis results.
+    Note:
+        If there are no results on a page, the page is skipped in the result stream.
     Steps:
-        Convert PDF to pairs of image and page information
-        Analyse pages, get list of bounding boxes per page (e.g. table cells)
-        Convert pixel values to inches
-        Rotate results if page is rotated
-        Format results to stream of dictionaries with page information and analysis results
+        Convert the PDF to a stream of per-page (image, metadata) tuples, where metadata is the page information
+        Analyse pages:
+            Get list of bounding boxes per page (e.g. table cells)
+            Convert pixel values to inches
+            Format results
     """
 
-    def pipeline(pdf: bytes, index=None):
-        image_metadata_pairs = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi)
-        results = map(image_metadata_pair_to_results, image_metadata_pairs)
-        results_filtered = filter(lambda x: x["bboxes"], results)
-        return results_filtered
+    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
+        image_metadata_stream = convert_pdf_to_image_and_metadata_stream(pdf, index=index, dpi=dpi)
+        results = starmap(analyse_image_metadata_pair, image_metadata_stream)
+        yield from filter(truth, results)
 
-    def image_metadata_pair_to_results(image_metadata_pair):
-        rectangles = analysis_fn(image_metadata_pair.image)
-        rectangles = map(partial(pixel_rect_to_inches_rect, dpi=dpi), rectangles)
-        if image_metadata_pair.metadata["rotation"] != 0:
-            rotate_rectangle_fn = partial(rotate_rectangle, metadata=image_metadata_pair.metadata)
-            rectangles = map(rotate_rectangle_fn, rectangles)
-        bboxes = lmap(lambda x: x.json_xyxy(), rectangles)
-        return {**image_metadata_pair.metadata, "bboxes": bboxes}
+    def analyse_image_metadata_pair(image, metadata):
+        rectangles = analysis_fn(image)
+        rectangles = map(partial(convert_pixel_rect_to_inches_rect, dpi=dpi), rectangles)
+        bboxes = lmap(lambda x: x.json_full(), rectangles)
+        return {**metadata, "bboxes": bboxes} if bboxes else {}
 
-    return pipeline
+    return analysis_pipeline
 
 
 def get_analysis_fn(analysis_type):
@@ -52,10 +50,9 @@ def get_analysis_fn(analysis_type):
         raise
 
 
-def pixel_rect_to_inches_rect(rect, dpi):
-    def convert_pixel_to_inch(pixel):
+def convert_pixel_rect_to_inches_rect(rect, dpi):
+    def pixel_to_inch(pixel):
         return pixel / dpi * 72
 
-    bbox = rect.x1, rect.y1, rect.x2, rect.y2
-    bbox_inches = tuple(map(convert_pixel_to_inch, bbox))
+    bbox_inches = tuple(map(pixel_to_inch, rect.xyxy()))
     return Rectangle.from_xyxy(bbox_inches, discrete=False)
diff --git a/cv_analysis/server/rotate.py b/cv_analysis/server/rotate.py
deleted file mode 100644
index ec9a867..0000000
--- a/cv_analysis/server/rotate.py
+++ /dev/null
@@ -1,107 +0,0 @@
-from _operator import itemgetter
-
-import numpy as np
-
-from cv_analysis.utils.structures import Rectangle
-
-
-def rotate_rectangle(rectangle, metadata):
-    width, height, rotation = itemgetter("width", "height", "rotation")(metadata)
-    rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation
-
-    if rotation in [1, 3]:
-        width, height = height, width
-
-    x1, y1, x2, y2 = rectangle.xyxy()
-    matrix = np.vstack([[x1, y1], [x2, y2]]).T
-    new_matrix = rotate_and_shift(matrix, rotation, (width, height))
-
-    x1, x2 = sorted(new_matrix[0, :])
-    y1, y2 = sorted(new_matrix[1, :])
-
-    return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False)
-
-
-def rotate_and_shift(matrix, rotation, size, debug=False):
-    """Rotates a matrix against (!) a specified rotation. That is, the rotation is applied negatively. The matrix is
-    also shifted to ensure it contains points (columns) in quadrant I.
-
-    Procedure:
-        1) Rotate the matrix clockwise according to rotation value
-        2) Shift the matrix back into quadrant I
-        3) Set x_i and y_i to new lower left and upper right corners, since the corner vectors are no longer at these
-           corners due to the rotation
-
-    Args:
-        matrix: matrix to transform
-        rotation: any of 0, 1, 2, or 3, where 1 = 90 degree CLOCKWISE rotation etc.
-        size: the size of the page as a tuple (<width>, <height>)
-        debug: Visualizes the transformations for later re-understanding of the code
-    """
-
-    def shift_to_quadrant_1(matrix):
-
-        # TODO: generalize
-        if rotation == 0:
-            back_shift = np.zeros_like(np.eye(2))
-        elif rotation == 1:
-            back_shift = np.array([[0, 0], [1, 1]]) * size[1]
-        elif rotation == 2:
-            back_shift = np.array([[1, 1], [1, 1]]) * size
-        elif rotation == 3:
-            back_shift = np.array([[1, 1], [0, 0]]) * size[0]
-        else:
-            raise ValueError(f"Unexpected rotation value '{rotation}'. Expected any of 0, 1, 2, or 3.")
-
-        matrix_shifted = matrix + back_shift
-        return matrix_shifted
-
-    # PDF rotations are clockwise, hence subtract the radian value of the rotation from 2 pi
-    radians = (2 * np.pi) - (np.pi * (rotation / 2))
-    matrix_rotated = rotate(matrix, radians)
-    matrix_rotated_and_shifted = shift_to_quadrant_1(matrix_rotated)
-
-    if debug:
-        __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted)
-    return matrix_rotated_and_shifted
-
-
-def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_shifted):
-
-    import matplotlib.pyplot as plt
-    from copy import deepcopy
-
-    m1 = matrix
-    m2 = matrix_rotated
-    m3 = matrix_rotated_and_shifted
-
-    m1, m2, m3 = map(deepcopy, (m1, m2, m3))
-
-    frame = np.eye(2) * size
-    frame_rotated = rotate(frame, radians)
-
-    f1 = frame
-    f2 = frame_rotated
-
-    f1 *= 0.005 * 1
-    f2 *= 0.005 * 1
-    m1 *= 0.005 * 1
-    m2 *= 0.005 * 1
-    m3 *= 0.005 * 1
-
-    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
-    axes = axes.ravel()
-
-    axes[0].quiver([0, 0], [0, 0], f1[0, :], f1[1, :], scale=5, scale_units="inches", color="red")
-    axes[1].quiver([0, 0], [0, 0], f2[0, :], f2[1, :], scale=5, scale_units="inches", color="red")
-    axes[0].quiver([0, 0], [0, 0], m1[0, :], m1[1, :], scale=5, scale_units="inches")
-    axes[1].quiver([0, 0], [0, 0], m2[0, :], m2[1, :], scale=5, scale_units="inches", color="green")
-    axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue")
-
-    plt.show()
-
-
-def rotate(input_matrix, radians):
-    rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]])
-
-    return np.dot(rotation_matrix, input_matrix)
diff --git a/cv_analysis/utils/pdf2image.py b/cv_analysis/utils/pdf2image.py
deleted file mode 100644
index a26b003..0000000
--- a/cv_analysis/utils/pdf2image.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import Iterator
-
-import fitz
-import numpy as np
-
-
-@dataclass
-class ImageMetadataPair:
-    image: np.ndarray
-    metadata: dict
-
-
-def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
-    """Streams PDF as pairs of image (matrix) and metadata.
-    Note: If index is not given or evaluates to None, the whole PDF will be processed.
-    """
-    convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
-    yield from map(convert_fn, stream_pages(pdf, index))
-
-
-def page_to_image_metadata_pair(page: fitz.Page, dpi):
-    metadata = get_page_info(page)
-    pixmap = page.get_pixmap(dpi=dpi)
-    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
-
-    return ImageMetadataPair(array, metadata)
-
-
-def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
-    with fitz.open(stream=pdf) as pdf_handle:
-        if not index:
-            yield from pdf_handle
-        else:
-            for i in index:
-                yield pdf_handle[i]
-
-
-def get_page_info(page):
-    return {
-        "index": page.number,
-        "rotation": page.rotation,
-        "width": page.rect.width,  # rotated page width in inches
-        "height": page.rect.height,  # rotated page height in inches
-    }
diff --git a/incl/pdf2image b/incl/pdf2image
new file mode 160000
index 0000000..d1a68b9
--- /dev/null
+++ b/incl/pdf2image
@@ -0,0 +1 @@
+Subproject commit d1a68b9e580ecbc0cd3050deeedc2d648b377232
diff --git a/incl/pyinfra b/incl/pyinfra
index 6c26528..0f6512d 160000
--- a/incl/pyinfra
+++ b/incl/pyinfra
@@ -1 +1 @@
-Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9
+Subproject commit 0f6512df5423df98d334f5735170cd1f7642998a
diff --git a/test/unit_tests/pdf2image_test.py b/test/unit_tests/pdf2image_test.py
deleted file mode 100644
index 4a44a26..0000000
--- a/test/unit_tests/pdf2image_test.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import fitz
-import numpy as np
-import pytest
-
-from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
-
-
-@pytest.fixture
-def pdf(n_pages):
-    doc = fitz.open()
-    for n in range(n_pages):
-        page = doc.new_page()
-        where = fitz.Point(50, 100)
-        page.insert_text(where, "De gustibus non est disputandum.", fontsize=30)
-    return doc.write()
-
-
-@pytest.mark.parametrize("n_pages", [1])
-def test_pdf_to_array_and_metadata(pdf):
-    for image_metadata_pair in pdf_to_image_metadata_pairs(pdf):
-        assert isinstance(image_metadata_pair.image, np.ndarray)
-        assert image_metadata_pair.image.shape == (2339, 1653, 3)  # Height, Width, Color channels
-
-        assert isinstance(image_metadata_pair.metadata, dict)
diff --git a/test/unit_tests/server_pipeline_test.py b/test/unit_tests/server_pipeline_test.py
new file mode 100644
index 0000000..dac8527
--- /dev/null
+++ b/test/unit_tests/server_pipeline_test.py
@@ -0,0 +1,40 @@
+import fitz
+import numpy as np
+import pytest
+
+from cv_analysis.server.pipeline import make_analysis_pipeline
+from cv_analysis.utils.structures import Rectangle
+
+
+def analysis_fn_mock(image: np.ndarray):
+    bbox = (0, 0, 42, 42)
+    return [Rectangle.from_xyxy(bbox)]
+
+
+@pytest.fixture
+def empty_pdf(n_pages):
+    doc = fitz.open()
+    for n in range(n_pages):
+        doc.new_page()
+    return doc.write()
+
+
+@pytest.fixture
+def expected_formatted_analysis_result(n_pages):
+    return [
+        {
+            "pageNumber": page_number,
+            "rotation": 0,
+            "width": 595.0,
+            "height": 842.0,
+            "bboxes": [{"x1": 0.0, "y1": 0.0, "x2": 15.12, "y2": 15.12, "width": 15.12, "height": 15.12}],
+        }
+        for page_number in range(n_pages)
+    ]
+
+
+@pytest.mark.parametrize("n_pages", [1, 2])
+def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result):
+    analysis_pipeline = make_analysis_pipeline(analysis_fn_mock)
+    results = analysis_pipeline(empty_pdf)
+    assert list(results) == expected_formatted_analysis_result
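
Usage sketch (reviewer note, not part of the patch): the reworked entry point in cv_analysis/server/pipeline.py now consumes the pdf2image submodule's convert_pdf_to_image_and_metadata_stream and lazily yields one result dict per page that has bounding boxes, mirroring test/unit_tests/server_pipeline_test.py. The stand-in analysis function and the input file name below are illustrative assumptions, not taken from this change.

from cv_analysis.server.pipeline import make_analysis_pipeline
from cv_analysis.utils.structures import Rectangle


def detect_boxes(image):
    # Stand-in for a real analysis function: pretend one 42x42-pixel box was found.
    return [Rectangle.from_xyxy((0, 0, 42, 42))]


pipeline = make_analysis_pipeline(detect_boxes, dpi=200)

with open("sample.pdf", "rb") as pdf_file:  # hypothetical input PDF
    # Pages are rendered and analysed one at a time; pages without bboxes are skipped.
    for page_result in pipeline(pdf_file.read(), index=None):
        print(page_result["bboxes"])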