From df0bbc92c7ced46573ae2ef20dbed6ba9ae96b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Wed, 24 Apr 2024 11:38:26 +0200 Subject: [PATCH] RED-8932 Fixed not merged headline with identifier --- .../DocuMineBlockificationService.java | 4 ++-- .../SinglePages/DocumineIdentifierProblem.pdf | Bin 0 -> 9424 bytes 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index 6f8fd3b..ca72723 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -61,8 +61,8 @@ public class DocuMineBlockificationService { boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle() - .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle() + .contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString()); boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DocumineIdentifierProblem.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a13ba299f2447582ccf468b2f8ff61a15bc2e28a GIT binary patch literal 9424 zcmcIK2|QHY+bXioYpfN;H4y`BgYQ_Cs6 z^%N$C22cRQeJ4yzR~KnZ@umg=7_h}2K$61)s7MlpMe$>JBOPhfP%6`wN%f*KsdNu& z5D0Bd3kvX~gc~zF$TXH86<{%gsruj_OpHqR+ zB%Epa$6@)$F{u<5jX^i2u&5fw8!#vg4u!+u&}b|ck6Dc*LEQ|d21phy*nc(BmdWr8 z_Mpx+@S}w${k|MJi%Mq&0eH?zKs<~9BSx4j_=W-qXbeEq#<(HPn2g{6kW`Wb(wG`T z^Pt+B86h2zWG00k6hMJE5zZxn1E{Z$w5EE}DD&~yLa;0zXYHv$j9@00i6&tzGY1w( z8#s#@9w2a6W2OxdXP+EUfEMQw#BR!4i6 zx+hR#*Z3_p%eRJI0pAwx$-ejUk^i+!q|3y+jP{K3EaVfDNMa?vvddd}e0(s_>0YG9 zyKuV}`pBo2Pg}O$?|ew^iWFxsE~}qXSBZm*i<+-m(C0pyT4R-er?d0Uz;0N97pAfb zX1J-sZF5Q5K1|HYPs{`wSn?O=`iFkQVySD0lA4{Qwnwz95L^68Q7)wwOMZRK9O z9eZ^=I?$SKAI{icRz3Xo;Ix93+M1ZeL8}*aHqq{rgAsPZC#U6ri52vyu~uAC#r_q0 z5T=3qP~6+kMByQ?YsOp8)qYSJJQ2SQjfS+z#CsJQE)& z*j@0Ke6Olt=~$y}%o<|C@uSu7L$B?o&LLhTOe_nn+t)u%gzGDJol-}gkTp;~v`vbQd5;;o>2 z1at8<59Q8?$hZu|@myUAHQ(kfJ(OA*W_sXXx~pHt3q2N5N)1jdwn5b7?NDm)lQQ<} zlHbv{LqW>T|Fp?F%=TuJcBMgsrXAMelAksfFP4vX&ut!`pu4_>TgiN}E%&|nKHAIw zjb*Tf`DLux21d5Mg{0JwvoC;jk-hOUeT$UF;=${CvvP*>%dd>-KK5H==`h*6Tup&E zzQ%JX;*5Ltwbf?{gV%9S8b7461*NbV?xqGxRan3Lu=GD1@7(S2su^qx^G!E7y;VqA zimgzSK(tqL*|vMqPD8A#&1EK1b_5C(k%7NhH` z?EkV(86ahpiAblDd~3-6gNCjh+pQ*N(~_*V0WIa)f(C9eJ#%&iniRuofSAgd|$de*K}mhtwS5$*b(6S#t@}t zW<~E}L#&lA*%dH5ow765IG>DoaX0;aR;d-fs}@tF=&55COfXp#tp0J~*5Z#|4Y+#5 z{lCg|WxuW&&pdcUMhbWbrN2>~3(y3*&?{nOq znzXEPZrSWnzE)B&DYBmErb^hPnLOAZ8<9wkIS^6jBHM9>R(Kk7_VJ_EB$xGW!K+&r znJycYlh~+oUBd0{OQ~F3YJ92wo9e0~7RoAnLnq}-NQpXM+~A!bSmoi1BxOq-UgT}w z?cdgG@c0sPQ+l*lZf;W>Y0=Gy;tv(>U4jl$A(O|IayIH6E?YO_>}^Uhs}9|dk$D~Y zvh&pym87_hby@KjPabwI(hZwL?GMj{rANeMWjAMdiLOg-y;6kg%xZTEZ>u7qg*(3{ zNZjte-S0PUb=sJsDwYHtwp5lxpJ;%CGklE^;)ru;~J~@?W7$jpf*#o(BUTRgu>yV$fgmU(Y(Z0 zGUBqnG?6DsI^&nKN{{>M!T-3m)qcdL)kCRyXMBHse5~CW2f_VUcIRLFApc-8xxg)G zqPO8vSv&qbY-*o=dq*13)mCG9slNYj>m#`v+VAEEoM@`Jn&sI{>Peimicn0w$T({p z@Y%cePDQWDIk{9au_~-rVt~1;yY_}gYl7C{rdJ^&W6SHKC(C@kWRx#MODB0;n$A8v zP#4y_w6H1ec7Fecj@v6ty1GAq2KKlsq?HQid|6#F`f@qOyy8)8Pp5E@R3ljrmv$_| zA#2eC()GUQiQcWw7m~YPxd)AojIq3c&^t< zly@7m&{_2vhRq2(Dv|C3uSTlp*+dH+%-Nu{^AUB()}KxX%-&qS14;&eU`I6p`dVRd2?)DUj0f(pB)j;mtFe1E=Am$q>IyUEYC*!u2PH?kHU-J7Ea)^To@X!HalCb3QQOOXEfcT^xYxUYC-Zg8j zkJT3O=oQVw*cBdDGfxn)X?2KD_!2&DZ*l%v^7RQ}?>)Pt;xekY>c{k5wd#TmY=+fO zO>Jhsc{BL#$~KRy@wuXM=GUq-S1GR5{6=ZbOMmv@uI5#WiCz3z>B+>zsoNEzPR)m6 zts|zFiLWn>UKODi9;Kx6Soo@5W3F<@SOjfU>T!$(>GIf)*5S!R=37ojO_swAlvl{~ z5K3+o>iQ@(#(v(#usgItU*O}a0~5`JkE^gJ{)tGC$z@L~$^9WUG@zuF`OYi@vCk{2 z>&mBmB6cvogsHyE+P8ca{wPfbSfInJSBs^;25?Ty#^nq`fOhuDmcE)x45W#~gtdrX-52V+> zc$p*jFgjUv`00Ce`*HE;oI1tOmPIKB*OEV%p-))hpH3gQI-qn!8^umRw=Wwy=$5gQ zqPiycuD#bI>Zzm2uP-#*!1bgIW$Q7em6XQA$B37xuY*p!iQ2Bx@G)3r$%T=|Yo|tS zaFJ*h9n$Tc$rgZBi**)lDO%l8Hod?*#p&bK zj3uk(-M_jo(uliKDYvA%%mgM~pkTdWXIDz5ovcUMv>4XJ?u*FHiI7C8>PgM*p97cm zI^=weQM6$X_gi=eHvZ|vgf}-D>{H)#{<57Tx#%C@NWS0mGTZ|tyPN7^C%Z)w&jxM3 zEj}XBp!3amOoY;(0>}xS@;%?Kyynu4q27v;%6kt_IlG_NElEqW$hOqGPV;Q3_4nQM z=ZHbM^r1%IwcZ2on`TUoc_>@1dsujG*U^A4ZcznaYxauauX6G%UR2FXxu67s7m*uT zBR4ac{uDnX$q>xEJSYH~KX`|N{eKAxUo(SgaH>ELySR@|`6To6P`HwL*$b(4>0CPS7 z4drTJrfNl*1wm^OkPgA_EGShWgV`u&ZJZy@v`#cnmQN6XhEiU8fS(l{OuS_p26;fq znwB;iX&B+6lHWthWxnS!sXabbnZm`70{<^VVh(wY*+T@i+W z#vvV8RDVZsaop?-!ttFwNaxwrV(_RRXr0UWT>-EQKxBIn{fgL{F;6FBYAJ7;DsF!_bxz& zoa^JkLF1uks2_rZE^Qoc?ip(5&c>q)2lhg%<-hajgW7p3T$iAC=4CdbmAFr^*ryl&C@|PwbL*x@a0+Glse7`aI z_&Jjg3LiLxPxyYrg?}Z6KiGXdeohQA|0^+^^Y`EF{`bx1*th@6=5xKocZ}byJrX!vK1=*bB)DfPp|k^ZT3nuF-6nmwPjJ11q__h{zO-1-Sw;$>JM}#bRui z3cXU-8V+c58PYSbmr9IvIx2r@YZDmfA!meo_pKsO6+!qDLjVvA!2#gt7B;%U- zKvwu(;_df6H+56}JDkXNMVF-ARSz;6vRc<(`rwo}W7Jy&(6a1g**bY6zl>x#RP+3o8D!8og!4NPP7@Er!I~1}3kJM&DinnN5b|(% z#)C6O1E5s>9078gcp!78i{mk7_1=?0XF)pvo?c+MF3|0J*lO+xhKw{;__|GV?1th_ zFfe0mq0^vZIDnZoU32m150vn`!k?xmnHxKV2Z52EIo*o^R_$$=o>V3n4r=fwT@A{O zHx0}L!Zi%RgBR**a18+gepG)bj0G9-bKH#Ky5hN&f)hZ5%`O-Uo}pKy6L?Ic?aDFn z0FJ=%U2XtQi`xPU)^}UVLC9m?tSu#CiQgSL9(?&8K7%U7XG;m#Sx1iJOgkGrW6(rk z9To+iXrRFp3h;-(wWX#Mf0`diClto4<_rMUj0#1kT(y8!&Iz%(TL-<)hSi{&{}fiw z#nNarjyUVuF$Aq&8Py-C<@a*GS8^QTdose$VG_UExgC08=@7WQvF5_y)9MZirRa=b zkjW9iL0HJ`k(;dJlI{m4acM}M2aEcbJ1Z^=Dek!Pq_pY8Dhy&*j?j5W>4&w~MUZ01 z>wU@}uMfQ*S5$ZQL7EaJb#*_6uh&-yuR0f=v`O_$?vY&o+)L39VqbV6jtp4(TiMBx zRX*gXMGR*Y^!<4so@2UmLFDYZz+}xeVNa}c!pN3#qsC=j*n)^t z+B3al{n%G~7_Wq~y_FlhEdNg1w`e_c=WW&6=vp z_A1dWYwx&yJaF`J4vl8hg+y8v>nz3-R6m5H+*$1guVGx0#WrwzGGgvejLQ4sn%&T>7!qul}(m8Wq26 zZ)aQ;7(G~(u@D(>_87t3;rC;q00NTFW1J4rj==envztkhv3Q z`D-4|MK7QQG@`MBY^h8V!#@Bt_fW2mBr*ILOosr92Ng6eyp!5r@S2wsYaz|S17KzV z13In*yeNJ_-;XOny@h-Ucy7tHDLeu%rN5NB)f)yt!b3fPg>@z%Hb3 z+6X?@7A}xnDzMCBA@E7E3~1f>Lfr`StU`oaNGM}Q>T5D+Kw z?KMVmV3QX>TH2r`D#f~;<03y`oVG~NaDUf+k;iP*7UAeE*g7)XyWaQ9D%|5-k;l$^ zzQQiM2;AFQ?@L&TSo$ho@^vuL`3?h~qI_?U)w2N`iGWbHy5E=3xIVM6h@{KAg#Rox zIGeY=p6C-(9_9?8a2^Swb94K(jP&`j@g527ayq69XeB zZSZ&Q4IF{mh%MA=i1OHFy@5*_}d7eBjrT?EHIH6_`i^61a1zL-sg#$ld LQZunJg^B$SUeR^9 literal 0 HcmV?d00001