Fix the encoding for some missing Chinese fonts

It fixes #20489.
This commit is contained in:
calixteman 2025-12-23 14:05:17 +01:00
parent 9f4db380aa
commit 91033c2199
No known key found for this signature in database
GPG Key ID: 0C5442631EE0691F
5 changed files with 112 additions and 1 deletions

View File

@ -3588,7 +3588,7 @@ class PartialEvaluator {
if (properties.composite) {
// CIDSystemInfo helps to match CID to glyphs
const cidSystemInfo = dict.get("CIDSystemInfo");
if (cidSystemInfo instanceof Dict) {
if (cidSystemInfo instanceof Dict && !properties.cidSystemInfo) {
properties.cidSystemInfo = {
registry: stringToPDFString(cidSystemInfo.get("Registry")),
ordering: stringToPDFString(cidSystemInfo.get("Ordering")),
@ -3670,6 +3670,51 @@ class PartialEvaluator {
baseEncodingName = null;
}
// Ignore incorrectly specified WinAnsiEncoding for non-embedded CJK fonts
// (fixes issue20489). Some Chinese fonts are declared with WinAnsiEncoding in
// the PDF even though they should use the Identity-H or GB-EUC-H encoding.
if (
baseEncodingName === "WinAnsiEncoding" &&
nonEmbeddedFont &&
properties.name?.charCodeAt(0) >= 0xb7
) {
const fontName = properties.name;
// This list is built from font names found in PDFium and MuPDF:
// - https://pdfium.googlesource.com/pdfium/+/master/core/fpdfapi/font/cpdf_font.cpp#41
// - https://fossies.org/linux/mupdf/source/pdf/pdf-font.c#l_820
const chineseFontNames = [
"\xCB\xCE\xCC\xE5", // SimSun
"\xBA\xDA\xCC\xE5", // SimHei
"\xBF\xAC\xCC\xE5", // SimKai
"\xB7\xC2\xCB\xCE", // SimFang
"\xBF\xAC\xCC\xE5_GB2312", // SimKai
"\xB7\xC2\xCB\xCE_GB2312", // SimFang
"\xC1\xA5\xCA\xE9", // SimLi
"\xD0\xC2\xCB\xCE", // SimSun
];
// Check whether the font name is the GBK-encoded name of a common Chinese
// font (such names appear as Latin-1 strings when incorrectly decoded).
if (chineseFontNames.includes(fontName)) {
baseEncodingName = null;
properties.defaultEncoding = "Adobe-GB1-UCS2";
properties.composite = true;
properties.cidEncoding = Name.get("GBK-EUC-H");
const cMap = await CMapFactory.create({
encoding: properties.cidEncoding,
fetchBuiltInCMap: this._fetchBuiltInCMapBound,
useCMap: null,
});
properties.cMap = cMap;
properties.vertical = properties.cMap.vertical;
properties.cidSystemInfo = {
registry: "Adobe",
ordering: "GB1",
supplement: 0,
};
}
}
if (baseEncodingName) {
properties.defaultEncoding = getEncoding(baseEncodingName);
} else {

View File

@ -306,6 +306,64 @@ const substitutionMap = new Map([
alias: "Wingdings",
},
],
[
"\xCB\xCE\xCC\xE5",
{
local: ["SimSun", "SimSun Regular", "NSimSun"],
style: NORMAL,
ultimate: "serif",
},
],
[
"\xBA\xDA\xCC\xE5",
{
local: ["SimHei", "SimHei Regular"],
style: NORMAL,
ultimate: "sans-serif",
},
],
[
"\xBF\xAC\xCC\xE5",
{
local: ["KaiTi", "SimKai", "SimKai Regular"],
style: NORMAL,
ultimate: "sans-serif",
},
],
[
"\xB7\xC2\xCB\xCE",
{
local: ["FangSong", "SimFang", "SimFang Regular"],
style: NORMAL,
ultimate: "serif",
},
],
[
"\xBF\xAC\xCC\xE5_GB2312",
{
alias: "\xBF\xAC\xCC\xE5",
},
],
[
"\xB7\xC2\xCB\xCE_GB2312",
{
alias: "\xB7\xC2\xCB\xCE",
},
],
[
"\xC1\xA5\xCA\xE9",
{
local: ["SimLi", "SimLi Regular"],
style: NORMAL,
ultimate: "serif",
},
],
[
"\xD0\xC2\xCB\xCE",
{
alias: "\xCB\xCE\xCC\xE5",
},
],
]);
const fontAliases = new Map([["Arial-Black", "ArialBlack"]]);

View File

@ -767,3 +767,4 @@
!issue20225.pdf
!issue20513.pdf
!issue20516.pdf
!issue20489.pdf

BIN
test/pdfs/issue20489.pdf Normal file

Binary file not shown.

View File

@ -13106,5 +13106,12 @@
"md5": "19a3a347773518242fa3cf1c04a9a1e4",
"rounds": 1,
"type": "eq"
},
{
"id": "issue20489",
"file": "pdfs/issue20489.pdf",
"md5": "b85c798b9a4cc2cd4337d335321cc612",
"rounds": 1,
"type": "eq"
}
]