From 8f85e3f20ba68986cad0943716b47d012fd3e70f Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Mon, 25 May 2026 14:44:42 +0200 Subject: [PATCH] Load the predefined CMap for composite fonts that omit the FontDescriptor and add font substitutions for the standard Acrobat CJK families. --- src/core/evaluator.js | 7 + src/core/font_substitutions.js | 192 +++++++++++++++++++++++++++ test/pdfs/.gitignore | 1 + test/pdfs/90ms_rksj_h_sample.pdf | 67 ++++++++++ test/unit/api_spec.js | 19 +++ test/unit/font_substitutions_spec.js | 92 +++++++++++++ 6 files changed, 378 insertions(+) create mode 100644 test/pdfs/90ms_rksj_h_sample.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 6961d2b00..628c196cd 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -4436,6 +4436,13 @@ class PartialEvaluator { // FontDescriptor is only required for Type3 fonts when the document // is a tagged pdf. descriptor = Dict.empty; + } else if (composite) { + // Some PDFs omit the FontDescriptor on the descendant CIDFont when + // referencing one of the standard Acrobat CJK fonts via a predefined + // CMap (e.g. /Encoding /90ms-RKSJ-H with /BaseFont /HeiseiMin-W3). + // Fall through so the CMap is loaded by the composite-font path + // below; otherwise multi-byte codes would be decoded byte-by-byte. + descriptor = Dict.empty; } else { // Before PDF 1.5 if the font was one of the base 14 fonts, having a // FontDescriptor was not required. diff --git a/src/core/font_substitutions.js b/src/core/font_substitutions.js index 7325087b3..fc7b16c35 100644 --- a/src/core/font_substitutions.js +++ b/src/core/font_substitutions.js @@ -21,6 +21,10 @@ const NORMAL = { style: "normal", weight: "normal", }; +const MEDIUM = { + style: "normal", + weight: "500", +}; const BOLD = { style: "normal", weight: "bold", @@ -364,6 +368,194 @@ const substitutionMap = new Map([ alias: "\xCB\xCE\xCC\xE5", }, ], + // Standard Acrobat CJK fonts. These BaseFont names appear in PDFs that + // don't embed a CJK font and rely on the reader having Acrobat's bundled + // CJK fonts installed. + // Adobe-Japan1 - Mincho (serif). + [ + "HeiseiMin-W3", + { + local: [ + "Hiragino Mincho ProN", + "Hiragino Mincho Pro", + "Yu Mincho", + "YuMincho", + "Source Han Serif JP", + "Noto Serif JP", + "Noto Serif CJK JP", + "IPAexMincho", + "IPAMincho", + "Takao Mincho", + "MS Mincho", + "MS PMincho", + ], + style: NORMAL, + ultimate: "serif", + }, + ], + // Adobe-Japan1 - Gothic (sans-serif). + [ + "HeiseiKakuGo-W5", + { + local: [ + "Hiragino Kaku Gothic ProN", + "Hiragino Kaku Gothic Pro", + "Hiragino Sans", + "Yu Gothic", + "YuGothic", + "Source Han Sans JP", + "Noto Sans JP", + "Noto Sans CJK JP", + "IPAexGothic", + "IPAGothic", + "Takao Gothic", + "Meiryo", + "MS Gothic", + "MS PGothic", + ], + style: MEDIUM, + ultimate: "sans-serif", + }, + ], + // Common Adobe-Japan1 variants and Kozuka names. + ["HeiseiMin-W3-Acro", { alias: "HeiseiMin-W3" }], + ["HeiseiKakuGo-W5-Acro", { alias: "HeiseiKakuGo-W5" }], + ["KozMinPro-Regular", { alias: "HeiseiMin-W3" }], + ["KozMinProVI-Regular", { alias: "HeiseiMin-W3" }], + ["KozMinPr6N-Regular", { alias: "HeiseiMin-W3" }], + ["KozGoPro-Regular", { alias: "HeiseiKakuGo-W5" }], + ["KozGoProVI-Regular", { alias: "HeiseiKakuGo-W5" }], + ["KozGoPr6N-Regular", { alias: "HeiseiKakuGo-W5" }], + + // Adobe-GB1 - Song (Simplified Chinese serif). + [ + "STSong-Light", + { + local: [ + "STSong", + "Songti SC", + "Source Han Serif SC", + "Source Han Serif CN", + "Noto Serif SC", + "Noto Serif CJK SC", + "AR PL UMing CN", + "SimSun", + "NSimSun", + ], + style: NORMAL, + ultimate: "serif", + }, + ], + // Adobe-GB1 - Hei (Simplified Chinese sans-serif). + [ + "STHeiti-Regular", + { + local: [ + "STHeiti", + "Heiti SC", + "PingFang SC", + "Source Han Sans SC", + "Source Han Sans CN", + "Noto Sans SC", + "Noto Sans CJK SC", + "Microsoft YaHei", + "SimHei", + "WenQuanYi Zen Hei", + ], + style: NORMAL, + ultimate: "sans-serif", + }, + ], + ["STSongStd-Light", { alias: "STSong-Light" }], + ["AdobeSongStd-Light", { alias: "STSong-Light" }], + ["AdobeHeitiStd-Regular", { alias: "STHeiti-Regular" }], + // KaiTi (regular script) and FangSong (imitation Song) are different + // typographic styles; route to the existing GB2312-keyed entries above. + ["AdobeKaitiStd-Regular", { alias: "\xBF\xAC\xCC\xE5" }], + ["AdobeFangsongStd-Regular", { alias: "\xB7\xC2\xCB\xCE" }], + + // Adobe-CNS1 - Sung (Traditional Chinese serif). + [ + "MSung-Light", + { + local: [ + "Songti TC", + "LiSong Pro", + "Source Han Serif TC", + "Source Han Serif TW", + "Noto Serif TC", + "Noto Serif CJK TC", + "AR PL UMing TW", + "PMingLiU", + "MingLiU", + "MingLiU_HKSCS", + ], + style: NORMAL, + ultimate: "serif", + }, + ], + // Adobe-CNS1 - Hei (Traditional Chinese sans-serif). + [ + "MHei-Medium", + { + local: [ + "Heiti TC", + "STHeiti", + "Source Han Sans TC", + "Source Han Sans TW", + "Noto Sans TC", + "Noto Sans CJK TC", + "PingFang TC", + "Microsoft JhengHei", + ], + style: MEDIUM, + ultimate: "sans-serif", + }, + ], + ["MSungStd-Light", { alias: "MSung-Light" }], + ["AdobeMingStd-Light", { alias: "MSung-Light" }], + + // Adobe-Korea1 - Myeongjo (Korean serif). + [ + "HYSMyeongJo-Medium", + { + local: [ + "AppleMyungjo", + "Source Han Serif KR", + "Noto Serif KR", + "Noto Serif CJK KR", + "Nanum Myeongjo", + "Batang", + ], + style: MEDIUM, + ultimate: "serif", + }, + ], + // Adobe-Korea1 - Gothic (Korean sans-serif). + [ + "HYGoThic-Medium", + { + local: [ + "Apple SD Gothic Neo", + "AppleGothic", + "Source Han Sans KR", + "Noto Sans KR", + "Noto Sans CJK KR", + "Nanum Gothic", + "Malgun Gothic", + "Dotum", + "Gulim", + ], + style: MEDIUM, + ultimate: "sans-serif", + }, + ], + ["HYSMyeongJoStd-Medium", { alias: "HYSMyeongJo-Medium" }], + ["AdobeMyungjoStd-Medium", { alias: "HYSMyeongJo-Medium" }], + // Bold variants reuse the same fallback list with a bold style override + // so the @font-face declaration requests a bold local() match. + ["HYGoThic-Bold", { alias: "HYGoThic-Medium", style: BOLD }], + ["AdobeGothicStd-Bold", { alias: "HYGoThic-Medium", style: BOLD }], ]); const fontAliases = new Map([["Arial-Black", "ArialBlack"]]); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index e866a0f1f..3da556ba1 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -924,3 +924,4 @@ !Embedded_font.pdf !issue18548_reduced.pdf !issue_cff_unsigned_bbox.pdf +!90ms_rksj_h_sample.pdf diff --git a/test/pdfs/90ms_rksj_h_sample.pdf b/test/pdfs/90ms_rksj_h_sample.pdf new file mode 100644 index 000000000..9bd93307b --- /dev/null +++ b/test/pdfs/90ms_rksj_h_sample.pdf @@ -0,0 +1,67 @@ +%PDF-1.4 +% + +1 0 obj +<< /Type /Catalog /Pages 2 0 R >> +endobj + +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj + +3 0 obj +<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] + /Contents 4 0 R + /Resources << /Font << /F0 5 0 R /F1 7 0 R >> >> >> +endobj + +4 0 obj +<< /Length 92 >> +stream +BT +/F0 14 Tf +72 720 Td +(Hello ASCII) Tj +0 -28 Td +/F1 14 Tf +<93FA967B8CEA836583588367> Tj +ET + +endstream +endobj + +5 0 obj +<< /Type /Font /Subtype /Type1 + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding >> +endobj + +6 0 obj +<< /Type /Font /Subtype /CIDFontType2 + /BaseFont /HeiseiMin-W3 + /CIDSystemInfo << /Registry (Adobe) /Ordering (Japan1) /Supplement 2 >> + /DW 1000 >> +endobj + +7 0 obj +<< /Type /Font /Subtype /Type0 + /BaseFont /HeiseiMin-W3 + /Encoding /90ms-RKSJ-H + /DescendantFonts [6 0 R] >> +endobj + +xref +0 8 +0000000000 65535 f +0000000016 00000 n +0000000066 00000 n +0000000124 00000 n +0000000267 00000 n +0000000410 00000 n +0000000514 00000 n +0000000685 00000 n +trailer +<< /Size 8 /Root 1 0 R >> +startxref +816 +%%EOF diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 60419ed46..bec3cd7f6 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -4009,6 +4009,25 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content from a Type0 composite font with no FontDescriptor, using a predefined CMap", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("90ms_rksj_h_sample.pdf", { + cMapUrl: CMAP_URL, + useWorkerFetch: false, + }) + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + + expect(text).toEqual("Hello ASCII\n日本語テスト"); + + await loadingTask.destroy(); + }); + it("gets text content with a rised text", async function () { const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf")); const pdfDoc = await loadingTask.promise; diff --git a/test/unit/font_substitutions_spec.js b/test/unit/font_substitutions_spec.js index eefdfd9f6..ae20c570d 100644 --- a/test/unit/font_substitutions_spec.js +++ b/test/unit/font_substitutions_spec.js @@ -546,4 +546,96 @@ describe("getFontSubstitution", function () { /^"ArialBlack",g_d(\d+)_sf(\d+),sans-serif$/ ); }); + + it("should substitute HeiseiMin-W3", () => { + const fontName = "HeiseiMin-W3"; + const fontSubstitution = getFontSubstitution( + new Map(), + idFactory, + localFontPath, + fontName, + undefined, + "CIDFontType2" + ); + expect(fontSubstitution).toEqual( + jasmine.objectContaining({ + guessFallback: false, + baseFontName: "HeiseiMin-W3", + src: + "local(Hiragino Mincho ProN),local(Hiragino Mincho Pro)," + + "local(Yu Mincho),local(YuMincho),local(Source Han Serif JP)," + + "local(Noto Serif JP),local(Noto Serif CJK JP)," + + "local(IPAexMincho),local(IPAMincho),local(Takao Mincho)," + + "local(MS Mincho),local(MS PMincho)", + style: { + style: "normal", + weight: "normal", + }, + }) + ); + expect(fontSubstitution.css).toMatch( + /^"HeiseiMin W3",g_d(\d+)_sf(\d+),serif$/ + ); + }); + + it("should substitute a Kozuka Mincho alias", () => { + const fontName = "KozMinPr6N-Regular"; + const fontSubstitution = getFontSubstitution( + new Map(), + idFactory, + localFontPath, + fontName, + undefined, + "CIDFontType0" + ); + expect(fontSubstitution).toEqual( + jasmine.objectContaining({ + guessFallback: false, + baseFontName: "KozMinPr6N-Regular", + src: + "local(Hiragino Mincho ProN),local(Hiragino Mincho Pro)," + + "local(Yu Mincho),local(YuMincho),local(Source Han Serif JP)," + + "local(Noto Serif JP),local(Noto Serif CJK JP)," + + "local(IPAexMincho),local(IPAMincho),local(Takao Mincho)," + + "local(MS Mincho),local(MS PMincho)", + style: { + style: "normal", + weight: "normal", + }, + }) + ); + expect(fontSubstitution.css).toMatch( + /^"KozMinPr6N",g_d(\d+)_sf(\d+),serif$/ + ); + }); + + it("should substitute HYGoThic-Medium", () => { + const fontName = "HYGoThic-Medium"; + const fontSubstitution = getFontSubstitution( + new Map(), + idFactory, + localFontPath, + fontName, + undefined, + "CIDFontType2" + ); + expect(fontSubstitution).toEqual( + jasmine.objectContaining({ + guessFallback: false, + baseFontName: "HYGoThic-Medium", + src: + "local(Apple SD Gothic Neo),local(AppleGothic)," + + "local(Source Han Sans KR),local(Noto Sans KR)," + + "local(Noto Sans CJK KR),local(Nanum Gothic)," + + "local(Malgun Gothic),local(Dotum),local(Gulim)", + style: { + style: "normal", + weight: "500", + }, + }) + ); + expect(fontSubstitution.css).toMatch( + /^"HYGoThic",g_d(\d+)_sf(\d+),sans-serif$/ + ); + }); });