From 91033c2199888dd36f2ac0f373859748ddaab4d5 Mon Sep 17 00:00:00 2001 From: calixteman Date: Tue, 23 Dec 2025 14:05:17 +0100 Subject: [PATCH] Fix the encoding for some missing chinese fonts It fixes #20489. --- src/core/evaluator.js | 47 +++++++++++++++++++++++++- src/core/font_substitutions.js | 58 +++++++++++++++++++++++++++++++++ test/pdfs/.gitignore | 1 + test/pdfs/issue20489.pdf | Bin 0 -> 7337 bytes test/test_manifest.json | 7 ++++ 5 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 test/pdfs/issue20489.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 55fb31669..536a5682a 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3588,7 +3588,7 @@ class PartialEvaluator { if (properties.composite) { // CIDSystemInfo helps to match CID to glyphs const cidSystemInfo = dict.get("CIDSystemInfo"); - if (cidSystemInfo instanceof Dict) { + if (cidSystemInfo instanceof Dict && !properties.cidSystemInfo) { properties.cidSystemInfo = { registry: stringToPDFString(cidSystemInfo.get("Registry")), ordering: stringToPDFString(cidSystemInfo.get("Ordering")), @@ -3670,6 +3670,51 @@ class PartialEvaluator { baseEncodingName = null; } + // Ignore incorrectly specified WinAnsiEncoding for non-embedded CJK fonts + // (fixes issue20489). Some chinese fonts often have WinAnsiEncoding in the + // PDF even though they should use Identity-H or GB-EUC-H encoding. + if ( + baseEncodingName === "WinAnsiEncoding" && + nonEmbeddedFont && + properties.name?.charCodeAt(0) >= 0xb7 + ) { + const fontName = properties.name; + // This list is built from some names from Pdfium and mupdf: + // - https://pdfium.googlesource.com/pdfium/+/master/core/fpdfapi/font/cpdf_font.cpp#41 + // - https://fossies.org/linux/mupdf/source/pdf/pdf-font.c#l_820 + const chineseFontNames = [ + "\xCB\xCE\xCC\xE5", // SimSun + "\xBA\xDA\xCC\xE5", // SimHei + "\xBF\xAC\xCC\xE5", // SimKai + "\xB7\xC2\xCB\xCE", // SimFang + "\xBF\xAC\xCC\xE5_GB2312", // SimKai + "\xB7\xC2\xCB\xCE_GB2312", // SimFang + "\xC1\xA5\xCA\xE9", // SimLi + "\xD0\xC2\xCB\xCE", // SimSun + ]; + + // Check for common Chinese font names and their GBK-encoded equivalents + // (which may appear as Latin-1 when incorrectly decoded). + if (chineseFontNames.includes(fontName)) { + baseEncodingName = null; + properties.defaultEncoding = "Adobe-GB1-UCS2"; + properties.composite = true; + properties.cidEncoding = Name.get("GBK-EUC-H"); + const cMap = await CMapFactory.create({ + encoding: properties.cidEncoding, + fetchBuiltInCMap: this._fetchBuiltInCMapBound, + useCMap: null, + }); + properties.cMap = cMap; + properties.vertical = properties.cMap.vertical; + properties.cidSystemInfo = { + registry: "Adobe", + ordering: "GB1", + supplement: 0, + }; + } + } + if (baseEncodingName) { properties.defaultEncoding = getEncoding(baseEncodingName); } else { diff --git a/src/core/font_substitutions.js b/src/core/font_substitutions.js index 1e87fabac..7325087b3 100644 --- a/src/core/font_substitutions.js +++ b/src/core/font_substitutions.js @@ -306,6 +306,64 @@ const substitutionMap = new Map([ alias: "Wingdings", }, ], + [ + "\xCB\xCE\xCC\xE5", + { + local: ["SimSun", "SimSun Regular", "NSimSun"], + style: NORMAL, + ultimate: "serif", + }, + ], + [ + "\xBA\xDA\xCC\xE5", + { + local: ["SimHei", "SimHei Regular"], + style: NORMAL, + ultimate: "sans-serif", + }, + ], + [ + "\xBF\xAC\xCC\xE5", + { + local: ["KaiTi", "SimKai", "SimKai Regular"], + style: NORMAL, + ultimate: "sans-serif", + }, + ], + [ + "\xB7\xC2\xCB\xCE", + { + local: ["FangSong", "SimFang", "SimFang Regular"], + style: NORMAL, + ultimate: "serif", + }, + ], + [ + "\xBF\xAC\xCC\xE5_GB2312", + { + alias: "\xBF\xAC\xCC\xE5", + }, + ], + [ + "\xB7\xC2\xCB\xCE_GB2312", + { + alias: "\xB7\xC2\xCB\xCE", + }, + ], + [ + "\xC1\xA5\xCA\xE9", + { + local: ["SimLi", "SimLi Regular"], + style: NORMAL, + ultimate: "serif", + }, + ], + [ + "\xD0\xC2\xCB\xCE", + { + alias: "\xCB\xCE\xCC\xE5", + }, + ], ]); const fontAliases = new Map([["Arial-Black", "ArialBlack"]]); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 352366971..6080768c9 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -767,3 +767,4 @@ !issue20225.pdf !issue20513.pdf !issue20516.pdf +!issue20489.pdf diff --git a/test/pdfs/issue20489.pdf b/test/pdfs/issue20489.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1261d4d0b8723aedbaaecacdcc8855454e3a578f GIT binary patch literal 7337 zcmeHMc~}$I+E+vrs>KDDx(-E5f`pk&l1UTODk@mpn~5;(O$2Lwu(X#MeBwuE+FbXXCQ#)wten*pQqm+*W@|R%)IA4?{dz} z@4WAO9OV)jljVZjIBwZjb;*W>V2Czm)<7FKH#8(&N1|e-QJJWXN9D?R(txl*w*oop5sqK!$Rpo#oA8hIF{l@x(#|l;Vyj9AUu;QXn24s9dj&4JM5Uw1Xl9kSeuOq)m$i1Pn0p`3Rqb zN26lBR`)N+3awFTBoPewsWAe(3xBudCwt%?Mr3`RYvOtMMK|2fic%>~(@qxZsZ5C1M?;?o6kyU^)ZD<+Pf zTin;q=jNWOyv3@4c7-Cxw>f@KhU(7*`L5#F`V9Zde|chSOqqLgYn!0{1|b_cj`4%r zk=KmMfR>-XSX|gp5ubCs?1i>vI= z5vrVs)XB@D3Th6nQ+nU>7lroU9Vy^n^?sbIP4T%p_K+rGg5x42=cn0jj-utW@2)MK zn6OKB`GNQOT^n~aeDhrMShZ^Ele(f$*EnzX1kECcx*mQ}7)AudHR4wHwCzMh!g>U9|Rv?Xnc_tI@sA(EBDYjJtcdr7+B6<+S?i1s6}#kWVfx zzvi>z)R_A_JgrCh714v#YXW_iSTYUizN6P+bw{1&fSVEhl0pY(fa}d~m!1obxVhtA zZt`sT&&^ebcW=9Uw6ZPjgxA?jxM&$>L1&V&iIwMeDTSKBF^@b*Et*a z`02lKoYC<4n_F#to0^YLs9m;Lz6cB0H)W`zvHi5W>yFA%%2SO;?6cLoE+lTKYkSoA zf_1$dh-^#F z*E8=&^|JU|T{ z|H}Loy$%g8uZ+CiKUds)`{=qQ%X?4WlKzG3iSm(~S3PjfIp>x7#_M*UU!tM|tT-9& zlS6%MuKBJXS-7)x&C_SC`qe*duAiEIY=qOXW8=!h74$Kb+fVrsU4xD@i-rUJWh||7?18?W`v)^B!-$xMxwe z#Zy7_3OzjrOK6a(&P4dir@cdkW@5Q>f1BYs(j$AF=x3gi9U7q~{J4Qyk)40N3qqC8! zt!im6*PFB~boE=h&@;>`%R2&#@(r;ROU<8YECz8msLjUk481F>kISS2Q7)3JwUqANOo= z%X={1BaMfz_bq%SEe~Bp9xmGOs*FH|2K60>ypx$(|%n}%g@2W89N0A2-wA{x5f@YwPNi zZ|h3gy9XW`pFum(KEuXB%4r|mh%9}v&bkt7UfQo7&ztl~p7Yz$-;EEY$9Y*D?R&?1 z%$TpeBfnI7x8;PDJs#TcA4x&grDIP`*o*&yBf7bB*o<*7CDHe4q~x`l#5C=V5?PY;?Z!mMyN5UyPjogy}>9>Q0fsjhWaZz z?{O{{aH@<6U~?CMKnXSum;a&tKhD92HTds+|FCE(@RyK=SiM?j)ansF6@ho>81UH) zDZ2Slp&=G#6g)m3o_$-)INERP4#J*7-UWde|lFnMqwM1c-F zD7<+GUwlC7rH&P9;uB$>5Nsrq!azPWwNnp{3%V_ZS&EsZe}rkQZcG#E)yl+BjT+_+ z0m1(Xs&RR21U6tA)z};k!etRu<6}5Nut8$P!x5ZMP|eg&@k3hYIMbLvyZ#7vJf?d% z53s?0c!l~+eWt6)M$>@&^o?C}?{RX8sLuqlzwO_WlvN z;oayKtWF9})-a`MM$vx)ZSb>#t>>rr;r?AH`YYxGLhfIo=Yx*Fer^A3&i@6pbrhg` zK$}^;8RCyQwEZ7KHV(`i%_Fv+`10>ynpxGGs{3|P}~M{4`RniP;SSgK}{W$$H5=I5D3BPvHYE<0@V417}UQ(R8JQo;zV@}JSEWS zwefmo5(CuaK^JgQ0t)7I$z&YExfmN`voIDPV>4iFUzHq7>gkcDk^br!y;7f!;4T=8 zsmGbTXapqvDs8F(5u~AkAq-TQY)k;kLJg#Ta#vqq2rA&B0zLxC3^YV-OeE=nT7gmKabpBm?y$(^G-7270Gb1{$DM^{|k|!uc$M%jd8sVSE~f(ZKX{ZRtIaJbc7q zbb{ajcceTifwMR=oDgya7{S3ut&L^>6iuaL@pBy=HPDrB)R zk(ec8$#6DUMFES;11bdq8HVvhJvH_`jgV+#fl`FkaY{p#Jb>-$I5ny2oZde)zGp=y zTpm1wlGSk>((PiJo{X~r?*M`!v4! zSR5Y1yvN7EL1OqW4Mdk_^4M%HaO_<^HWvg(@6kA1bNe_vkP^Jd$K|s@n)j|e0>jK~ zA=q5-vAoB}6_}6ZnfpdSfa@A-)GO7Aq#hmyDzsW~7)J$?Xvaxk$H5mGtj-|8qM$yK zNH+mr!eK{A_v E0x@%o