diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 55fb31669..536a5682a 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3588,7 +3588,7 @@ class PartialEvaluator { if (properties.composite) { // CIDSystemInfo helps to match CID to glyphs const cidSystemInfo = dict.get("CIDSystemInfo"); - if (cidSystemInfo instanceof Dict) { + if (cidSystemInfo instanceof Dict && !properties.cidSystemInfo) { properties.cidSystemInfo = { registry: stringToPDFString(cidSystemInfo.get("Registry")), ordering: stringToPDFString(cidSystemInfo.get("Ordering")), @@ -3670,6 +3670,51 @@ class PartialEvaluator { baseEncodingName = null; } + // Ignore incorrectly specified WinAnsiEncoding for non-embedded CJK fonts + // (fixes issue20489). Some Chinese fonts often have WinAnsiEncoding in the + // PDF even though they should use Identity-H or GB-EUC-H encoding. + if ( + baseEncodingName === "WinAnsiEncoding" && + nonEmbeddedFont && + properties.name?.charCodeAt(0) >= 0xb7 + ) { + const fontName = properties.name; + // This list is built from some names from PDFium and MuPDF: + // - https://pdfium.googlesource.com/pdfium/+/master/core/fpdfapi/font/cpdf_font.cpp#41 + // - https://fossies.org/linux/mupdf/source/pdf/pdf-font.c#l_820 + const chineseFontNames = [ + "\xCB\xCE\xCC\xE5", // SimSun + "\xBA\xDA\xCC\xE5", // SimHei + "\xBF\xAC\xCC\xE5", // SimKai + "\xB7\xC2\xCB\xCE", // SimFang + "\xBF\xAC\xCC\xE5_GB2312", // SimKai + "\xB7\xC2\xCB\xCE_GB2312", // SimFang + "\xC1\xA5\xCA\xE9", // SimLi + "\xD0\xC2\xCB\xCE", // SimSun + ]; + + // Check for the GBK-encoded names of common Chinese fonts (their bytes + // show up as Latin-1 characters when incorrectly decoded). 
+ if (chineseFontNames.includes(fontName)) { + baseEncodingName = null; + properties.defaultEncoding = "Adobe-GB1-UCS2"; + properties.composite = true; + properties.cidEncoding = Name.get("GBK-EUC-H"); + const cMap = await CMapFactory.create({ + encoding: properties.cidEncoding, + fetchBuiltInCMap: this._fetchBuiltInCMapBound, + useCMap: null, + }); + properties.cMap = cMap; + properties.vertical = properties.cMap.vertical; + properties.cidSystemInfo = { + registry: "Adobe", + ordering: "GB1", + supplement: 0, + }; + } + } + if (baseEncodingName) { properties.defaultEncoding = getEncoding(baseEncodingName); } else { diff --git a/src/core/font_substitutions.js b/src/core/font_substitutions.js index 1e87fabac..7325087b3 100644 --- a/src/core/font_substitutions.js +++ b/src/core/font_substitutions.js @@ -306,6 +306,64 @@ const substitutionMap = new Map([ alias: "Wingdings", }, ], + [ + "\xCB\xCE\xCC\xE5", + { + local: ["SimSun", "SimSun Regular", "NSimSun"], + style: NORMAL, + ultimate: "serif", + }, + ], + [ + "\xBA\xDA\xCC\xE5", + { + local: ["SimHei", "SimHei Regular"], + style: NORMAL, + ultimate: "sans-serif", + }, + ], + [ + "\xBF\xAC\xCC\xE5", + { + local: ["KaiTi", "SimKai", "SimKai Regular"], + style: NORMAL, + ultimate: "sans-serif", + }, + ], + [ + "\xB7\xC2\xCB\xCE", + { + local: ["FangSong", "SimFang", "SimFang Regular"], + style: NORMAL, + ultimate: "serif", + }, + ], + [ + "\xBF\xAC\xCC\xE5_GB2312", + { + alias: "\xBF\xAC\xCC\xE5", + }, + ], + [ + "\xB7\xC2\xCB\xCE_GB2312", + { + alias: "\xB7\xC2\xCB\xCE", + }, + ], + [ + "\xC1\xA5\xCA\xE9", + { + local: ["SimLi", "SimLi Regular"], + style: NORMAL, + ultimate: "serif", + }, + ], + [ + "\xD0\xC2\xCB\xCE", + { + alias: "\xCB\xCE\xCC\xE5", + }, + ], ]); const fontAliases = new Map([["Arial-Black", "ArialBlack"]]); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 352366971..6080768c9 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -767,3 +767,4 @@ !issue20225.pdf 
!issue20513.pdf !issue20516.pdf +!issue20489.pdf diff --git a/test/pdfs/issue20489.pdf b/test/pdfs/issue20489.pdf new file mode 100644 index 000000000..1261d4d0b Binary files /dev/null and b/test/pdfs/issue20489.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index c9b9a5400..7a026c5c7 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -13106,5 +13106,12 @@ "md5": "19a3a347773518242fa3cf1c04a9a1e4", "rounds": 1, "type": "eq" + }, + { + "id": "issue20489", + "file": "pdfs/issue20489.pdf", + "md5": "b85c798b9a4cc2cd4337d335321cc612", + "rounds": 1, + "type": "eq" } ]