Merge pull request #21331 from calixteman/fix_cjk_file

Load the predefined CMap for composite fonts that omit the FontDescriptor
This commit is contained in:
calixteman 2026-05-25 16:40:11 +02:00 committed by GitHub
commit f82382e010
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 378 additions and 0 deletions

View File

@ -4436,6 +4436,13 @@ class PartialEvaluator {
// FontDescriptor is only required for Type3 fonts when the document
// is a tagged pdf.
descriptor = Dict.empty;
} else if (composite) {
// Some PDFs omit the FontDescriptor on the descendant CIDFont when
// referencing one of the standard Acrobat CJK fonts via a predefined
// CMap (e.g. /Encoding /90ms-RKSJ-H with /BaseFont /HeiseiMin-W3).
// Fall through so the CMap is loaded by the composite-font path
// below; otherwise multi-byte codes would be decoded byte-by-byte.
descriptor = Dict.empty;
} else {
// Before PDF 1.5 if the font was one of the base 14 fonts, having a
// FontDescriptor was not required.

View File

@ -21,6 +21,10 @@ const NORMAL = {
style: "normal",
weight: "normal",
};
// Upright style with a numeric weight of "500"; referenced by the CJK
// entries whose BaseFont names denote a W5/Medium face.
const MEDIUM = { style: "normal", weight: "500" };
const BOLD = {
style: "normal",
weight: "bold",
@ -364,6 +368,194 @@ const substitutionMap = new Map([
alias: "\xCB\xCE\xCC\xE5",
},
],
// Standard Acrobat CJK fonts. These BaseFont names appear in PDFs that
// don't embed a CJK font and rely on the reader having Acrobat's bundled
// CJK fonts installed.
// Adobe-Japan1 - Mincho (serif).
[
"HeiseiMin-W3",
{
local: [
"Hiragino Mincho ProN",
"Hiragino Mincho Pro",
"Yu Mincho",
"YuMincho",
"Source Han Serif JP",
"Noto Serif JP",
"Noto Serif CJK JP",
"IPAexMincho",
"IPAMincho",
"Takao Mincho",
"MS Mincho",
"MS PMincho",
],
style: NORMAL,
ultimate: "serif",
},
],
// Adobe-Japan1 - Gothic (sans-serif).
[
"HeiseiKakuGo-W5",
{
local: [
"Hiragino Kaku Gothic ProN",
"Hiragino Kaku Gothic Pro",
"Hiragino Sans",
"Yu Gothic",
"YuGothic",
"Source Han Sans JP",
"Noto Sans JP",
"Noto Sans CJK JP",
"IPAexGothic",
"IPAGothic",
"Takao Gothic",
"Meiryo",
"MS Gothic",
"MS PGothic",
],
style: MEDIUM,
ultimate: "sans-serif",
},
],
// Common Adobe-Japan1 variants and Kozuka names.
["HeiseiMin-W3-Acro", { alias: "HeiseiMin-W3" }],
["HeiseiKakuGo-W5-Acro", { alias: "HeiseiKakuGo-W5" }],
["KozMinPro-Regular", { alias: "HeiseiMin-W3" }],
["KozMinProVI-Regular", { alias: "HeiseiMin-W3" }],
["KozMinPr6N-Regular", { alias: "HeiseiMin-W3" }],
["KozGoPro-Regular", { alias: "HeiseiKakuGo-W5" }],
["KozGoProVI-Regular", { alias: "HeiseiKakuGo-W5" }],
["KozGoPr6N-Regular", { alias: "HeiseiKakuGo-W5" }],
// Adobe-GB1 - Song (Simplified Chinese serif).
[
"STSong-Light",
{
local: [
"STSong",
"Songti SC",
"Source Han Serif SC",
"Source Han Serif CN",
"Noto Serif SC",
"Noto Serif CJK SC",
"AR PL UMing CN",
"SimSun",
"NSimSun",
],
style: NORMAL,
ultimate: "serif",
},
],
// Adobe-GB1 - Hei (Simplified Chinese sans-serif).
[
"STHeiti-Regular",
{
local: [
"STHeiti",
"Heiti SC",
"PingFang SC",
"Source Han Sans SC",
"Source Han Sans CN",
"Noto Sans SC",
"Noto Sans CJK SC",
"Microsoft YaHei",
"SimHei",
"WenQuanYi Zen Hei",
],
style: NORMAL,
ultimate: "sans-serif",
},
],
["STSongStd-Light", { alias: "STSong-Light" }],
["AdobeSongStd-Light", { alias: "STSong-Light" }],
["AdobeHeitiStd-Regular", { alias: "STHeiti-Regular" }],
// KaiTi (regular script) and FangSong (imitation Song) are different
// typographic styles; route to the existing GB2312-keyed entries above.
["AdobeKaitiStd-Regular", { alias: "\xBF\xAC\xCC\xE5" }],
["AdobeFangsongStd-Regular", { alias: "\xB7\xC2\xCB\xCE" }],
// Adobe-CNS1 - Sung (Traditional Chinese serif).
[
"MSung-Light",
{
local: [
"Songti TC",
"LiSong Pro",
"Source Han Serif TC",
"Source Han Serif TW",
"Noto Serif TC",
"Noto Serif CJK TC",
"AR PL UMing TW",
"PMingLiU",
"MingLiU",
"MingLiU_HKSCS",
],
style: NORMAL,
ultimate: "serif",
},
],
// Adobe-CNS1 - Hei (Traditional Chinese sans-serif).
[
"MHei-Medium",
{
local: [
"Heiti TC",
"STHeiti",
"Source Han Sans TC",
"Source Han Sans TW",
"Noto Sans TC",
"Noto Sans CJK TC",
"PingFang TC",
"Microsoft JhengHei",
],
style: MEDIUM,
ultimate: "sans-serif",
},
],
["MSungStd-Light", { alias: "MSung-Light" }],
["AdobeMingStd-Light", { alias: "MSung-Light" }],
// Adobe-Korea1 - Myeongjo (Korean serif).
[
"HYSMyeongJo-Medium",
{
local: [
"AppleMyungjo",
"Source Han Serif KR",
"Noto Serif KR",
"Noto Serif CJK KR",
"Nanum Myeongjo",
"Batang",
],
style: MEDIUM,
ultimate: "serif",
},
],
// Adobe-Korea1 - Gothic (Korean sans-serif).
[
"HYGoThic-Medium",
{
local: [
"Apple SD Gothic Neo",
"AppleGothic",
"Source Han Sans KR",
"Noto Sans KR",
"Noto Sans CJK KR",
"Nanum Gothic",
"Malgun Gothic",
"Dotum",
"Gulim",
],
style: MEDIUM,
ultimate: "sans-serif",
},
],
["HYSMyeongJoStd-Medium", { alias: "HYSMyeongJo-Medium" }],
["AdobeMyungjoStd-Medium", { alias: "HYSMyeongJo-Medium" }],
// Bold variants reuse the same fallback list with a bold style override
// so the @font-face declaration requests a bold local() match.
["HYGoThic-Bold", { alias: "HYGoThic-Medium", style: BOLD }],
["AdobeGothicStd-Bold", { alias: "HYGoThic-Medium", style: BOLD }],
]);
// Pairs of [alternate spelling, spelling it is mapped to] for font names.
// NOTE(review): the lookup site is not visible in this hunk; confirm how
// the mapped name is consumed before adding entries.
const fontAliases = new Map([
  ["Arial-Black", "ArialBlack"],
]);

View File

@ -924,3 +924,4 @@
!Embedded_font.pdf
!issue18548_reduced.pdf
!issue_cff_unsigned_bbox.pdf
!90ms_rksj_h_sample.pdf

View File

@ -0,0 +1,67 @@
%PDF-1.4
%âãÏÓ
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources << /Font << /F0 5 0 R /F1 7 0 R >> >> >>
endobj
4 0 obj
<< /Length 92 >>
stream
BT
/F0 14 Tf
72 720 Td
(Hello ASCII) Tj
0 -28 Td
/F1 14 Tf
<93FA967B8CEA836583588367> Tj
ET
endstream
endobj
5 0 obj
<< /Type /Font /Subtype /Type1
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding >>
endobj
6 0 obj
<< /Type /Font /Subtype /CIDFontType2
/BaseFont /HeiseiMin-W3
/CIDSystemInfo << /Registry (Adobe) /Ordering (Japan1) /Supplement 2 >>
/DW 1000 >>
endobj
7 0 obj
<< /Type /Font /Subtype /Type0
/BaseFont /HeiseiMin-W3
/Encoding /90ms-RKSJ-H
/DescendantFonts [6 0 R] >>
endobj
xref
0 8
0000000000 65535 f
0000000016 00000 n
0000000066 00000 n
0000000124 00000 n
0000000267 00000 n
0000000410 00000 n
0000000514 00000 n
0000000685 00000 n
trailer
<< /Size 8 /Root 1 0 R >>
startxref
816
%%EOF

View File

@ -4009,6 +4009,25 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});
it("gets text content from a Type0 composite font with no FontDescriptor, using a predefined CMap", async function () {
  // The sample's second font is a Type0 font using the predefined
  // 90ms-RKSJ-H CMap whose descendant CIDFont carries no FontDescriptor.
  const params = buildGetDocumentParams("90ms_rksj_h_sample.pdf", {
    cMapUrl: CMAP_URL,
    useWorkerFetch: false,
  });
  const loadingTask = getDocument(params);
  const pdfDoc = await loadingTask.promise;
  const pdfPage = await pdfDoc.getPage(1);

  const textContent = await pdfPage.getTextContent({
    disableNormalization: true,
  });

  // The second line only comes out as Japanese text when the two-byte
  // codes in the content stream are decoded through the CMap.
  expect(mergeText(textContent.items)).toEqual("Hello ASCII\n日本語テスト");

  await loadingTask.destroy();
});
it("gets text content with a rised text", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf"));
const pdfDoc = await loadingTask.promise;

View File

@ -546,4 +546,96 @@ describe("getFontSubstitution", function () {
/^"ArialBlack",g_d(\d+)_sf(\d+),sans-serif$/
);
});
it("should substitute HeiseiMin-W3", () => {
  const fontName = "HeiseiMin-W3";
  // One local() source per fallback family, in the order listed for the
  // "HeiseiMin-W3" entry of the substitution map.
  const expectedSrc =
    "local(Hiragino Mincho ProN),local(Hiragino Mincho Pro)," +
    "local(Yu Mincho),local(YuMincho),local(Source Han Serif JP)," +
    "local(Noto Serif JP),local(Noto Serif CJK JP)," +
    "local(IPAexMincho),local(IPAMincho),local(Takao Mincho)," +
    "local(MS Mincho),local(MS PMincho)";
  const expectedStyle = { style: "normal", weight: "normal" };

  const substitution = getFontSubstitution(
    new Map(),
    idFactory,
    localFontPath,
    fontName,
    undefined,
    "CIDFontType2"
  );

  expect(substitution).toEqual(
    jasmine.objectContaining({
      guessFallback: false,
      baseFontName: fontName,
      src: expectedSrc,
      style: expectedStyle,
    })
  );
  // The css value ends with the generic "serif" family.
  expect(substitution.css).toMatch(
    /^"HeiseiMin W3",g_d(\d+)_sf(\d+),serif$/
  );
});
it("should substitute a Kozuka Mincho alias", () => {
  const fontName = "KozMinPr6N-Regular";
  // "KozMinPr6N-Regular" is declared as an alias of "HeiseiMin-W3", so the
  // expected sources are that entry's fallback families.
  const expectedSrc =
    "local(Hiragino Mincho ProN),local(Hiragino Mincho Pro)," +
    "local(Yu Mincho),local(YuMincho),local(Source Han Serif JP)," +
    "local(Noto Serif JP),local(Noto Serif CJK JP)," +
    "local(IPAexMincho),local(IPAMincho),local(Takao Mincho)," +
    "local(MS Mincho),local(MS PMincho)";
  const expectedStyle = { style: "normal", weight: "normal" };

  const substitution = getFontSubstitution(
    new Map(),
    idFactory,
    localFontPath,
    fontName,
    undefined,
    "CIDFontType0"
  );

  expect(substitution).toEqual(
    jasmine.objectContaining({
      guessFallback: false,
      // The reported base font name stays the requested alias name.
      baseFontName: fontName,
      src: expectedSrc,
      style: expectedStyle,
    })
  );
  expect(substitution.css).toMatch(
    /^"KozMinPr6N",g_d(\d+)_sf(\d+),serif$/
  );
});
it("should substitute HYGoThic-Medium", () => {
  const fontName = "HYGoThic-Medium";
  // One local() source per fallback family, in the order listed for the
  // "HYGoThic-Medium" entry of the substitution map.
  const expectedSrc =
    "local(Apple SD Gothic Neo),local(AppleGothic)," +
    "local(Source Han Sans KR),local(Noto Sans KR)," +
    "local(Noto Sans CJK KR),local(Nanum Gothic)," +
    "local(Malgun Gothic),local(Dotum),local(Gulim)";
  // This entry uses the medium style, i.e. a "500" weight.
  const expectedStyle = { style: "normal", weight: "500" };

  const substitution = getFontSubstitution(
    new Map(),
    idFactory,
    localFontPath,
    fontName,
    undefined,
    "CIDFontType2"
  );

  expect(substitution).toEqual(
    jasmine.objectContaining({
      guessFallback: false,
      baseFontName: fontName,
      src: expectedSrc,
      style: expectedStyle,
    })
  );
  expect(substitution.css).toMatch(
    /^"HYGoThic",g_d(\d+)_sf(\d+),sans-serif$/
  );
});
});