diff --git a/README.md b/README.md
index ad3b9276b..41f6d7d4d 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,10 @@ PDF.js is built into version 19+ of Firefox.
 Chrome, go to `Tools > Extension` and load the (unpackaged) extension from the
 directory `build/chromium`.
 
+### PDF debugger
+
+Browse the internal structure of a PDF document with https://mozilla.github.io/pdf.js/internal-viewer/web/pdf_internal_viewer.html
+
 ## Getting the Code
 
 To get a local copy of the current code, clone it using git:
diff --git a/eslint.config.mjs b/eslint.config.mjs
index 52ef76677..0afd3259c 100644
--- a/eslint.config.mjs
+++ b/eslint.config.mjs
@@ -82,6 +82,7 @@ export default [
         ...globals.worker,
         PDFJSDev: "readonly",
         __raw_import__: "readonly",
+        __eager_import__: "readonly",
       },
 
       ecmaVersion: 2025,
diff --git a/external/builder/babel-plugin-pdfjs-preprocessor.mjs b/external/builder/babel-plugin-pdfjs-preprocessor.mjs
index 347204513..99efc8eb4 100644
--- a/external/builder/babel-plugin-pdfjs-preprocessor.mjs
+++ b/external/builder/babel-plugin-pdfjs-preprocessor.mjs
@@ -190,6 +190,19 @@ function babelPluginPDFJSPreprocessor(babel, ctx) {
             },
           ];
           path.replaceWith(t.importExpression(source));
+        } else if (t.isIdentifier(node.callee, { name: "__eager_import__" })) {
+          if (node.arguments.length !== 1) {
+            throw new Error("Invalid `__eager_import__` usage.");
+          }
+          // Replace it with a standard `import`-call and inline the module.
+          const source = node.arguments[0];
+          source.leadingComments = [
+            {
+              type: "CommentBlock",
+              value: "webpackMode: 'eager'",
+            },
+          ];
+          path.replaceWith(t.importExpression(source));
         }
       },
       "BlockStatement|StaticBlock": {
diff --git a/gulpfile.mjs b/gulpfile.mjs
index 9ed5715b8..751347384 100644
--- a/gulpfile.mjs
+++ b/gulpfile.mjs
@@ -119,6 +119,7 @@ const DEFINES = Object.freeze({
   COMPONENTS: false,
   LIB: false,
   IMAGE_DECODERS: false,
+  INTERNAL_VIEWER: false,
 });
 
 function transform(charEncoding, transformFunction) {
@@ -2410,7 +2411,7 @@ gulp.task(
   "internal-viewer",
   gulp.series(createBuildNumber, function createInternalViewer() {
     console.log("\n### Creating internal viewer");
-    const defines = { ...DEFINES, GENERIC: true };
+    const defines = { ...DEFINES, GENERIC: true, INTERNAL_VIEWER: true };
     return buildInternalViewer(defines, INTERNAL_VIEWER_DIR);
   })
 );
diff --git a/src/core/document.js b/src/core/document.js
index f0a291e95..1dd6345ad 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -22,7 +22,6 @@ import {
   isArrayEqual,
   makeArr,
   objectSize,
-  OPS,
   PageActionEventType,
   RenderingIntentFlag,
   shadow,
@@ -38,17 +37,6 @@ import {
   PopupAnnotation,
   WidgetAnnotation,
 } from "./annotation.js";
-import {
-  Cmd,
-  Dict,
-  EOF,
-  isName,
-  isRefsEqual,
-  Name,
-  Ref,
-  RefSet,
-  RefSetCache,
-} from "./primitives.js";
 import {
   collectActions,
   getInheritableProperty,
@@ -63,9 +51,16 @@ import {
   XRefEntryException,
   XRefParseException,
 } from "./core_utils.js";
-import { EvaluatorPreprocessor, PartialEvaluator } from "./evaluator.js";
+import {
+  Dict,
+  isName,
+  isRefsEqual,
+  Name,
+  Ref,
+  RefSet,
+  RefSetCache,
+} from "./primitives.js";
 import { getXfaFontDict, getXfaFontName } from "./xfa_fonts.js";
-import { Lexer, Linearization, Parser } from "./parser.js";
 import { NullStream, Stream } from "./stream.js";
 import { BaseStream } from "./base_stream.js";
 import { calculateMD5 } from "./calculate_md5.js";
@@ -73,9 +68,11 @@ import { Catalog } from "./catalog.js";
 import { clearGlobalCaches } from "./cleanup_helper.js";
 import { DatasetReader } from "./dataset_reader.js";
 import { Intersector } from "./intersector.js";
+import { Linearization } from "./parser.js";
 import { LocalColorSpaceCache } from "./image_utils.js";
 import { ObjectLoader } from "./object_loader.js";
 import { OperatorList } from "./operator_list.js";
+import { PartialEvaluator } from "./evaluator.js";
 import { PDFFunctionFactory } from "./function.js";
 import { PDFImage } from "./image.js";
 import { StreamsSequenceStream } from "./decode_stream.js";
@@ -2038,9 +2035,16 @@ class PDFDocument {
   }
 
   async toJSObject(value, firstCall = true) {
-    if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) {
+    if (
+      typeof PDFJSDev !== "undefined" &&
+      !PDFJSDev.test("TESTING || INTERNAL_VIEWER")
+    ) {
       throw new Error("Not implemented: toJSObject");
     }
+    const { InternalViewerUtils } =
+      typeof PDFJSDev === "undefined"
+        ? await import("./internal_viewer_utils.js")
+        : await __eager_import__("./internal_viewer_utils.js");
 
     if (value === null && firstCall) {
       return this.toJSObject(this.xref.trailer, false);
@@ -2051,7 +2055,7 @@ class PDFDocument {
       for (const [key, val] of value.getRawEntries()) {
         obj[key] =
           isPage && key === "Contents"
-            ? _getContentTokens(val, this.xref)
+            ? InternalViewerUtils.getContentTokens(val, this.xref)
             : await this.toJSObject(val, false);
       }
       return obj;
@@ -2109,9 +2113,10 @@ class PDFDocument {
       if (isName(dict.get("Subtype"), "Form")) {
         obj.bytes = value.getString();
         value.reset();
-        const { instructions, cmdNames } = _groupIntoInstructions(
-          _tokenizeStream(value, this.xref)
-        );
+        const { instructions, cmdNames } =
+          InternalViewerUtils.groupIntoInstructions(
+            InternalViewerUtils.tokenizeStream(value, this.xref)
+          );
         obj.contentStream = true;
         obj.instructions = instructions;
         obj.cmdNames = cmdNames;
@@ -2125,130 +2130,4 @@ class PDFDocument {
   }
 }
 
-function _tokenizeStream(stream, xref) {
-  const tokens = [];
-  const parser = new Parser({
-    lexer: new Lexer(stream),
-    xref,
-    allowStreams: false,
-  });
-  while (true) {
-    let obj;
-    try {
-      obj = parser.getObj();
-    } catch {
-      break;
-    }
-    if (obj === EOF) {
-      break;
-    }
-    const token = _tokenToJSObject(obj);
-    if (token !== null) {
-      tokens.push(token);
-    }
-  }
-  return tokens;
-}
-
-function _getContentTokens(contentsVal, xref) {
-  const refs = Array.isArray(contentsVal) ? contentsVal : [contentsVal];
-  const rawContents = [];
-  const tokens = [];
-  for (const rawRef of refs) {
-    if (rawRef instanceof Ref) {
-      rawContents.push({ num: rawRef.num, gen: rawRef.gen });
-    }
-    const stream = xref.fetchIfRef(rawRef);
-    if (!(stream instanceof BaseStream)) {
-      continue;
-    }
-    tokens.push(..._tokenizeStream(stream, xref));
-  }
-  const { instructions, cmdNames } = _groupIntoInstructions(tokens);
-  return { contentStream: true, instructions, cmdNames, rawContents };
-}
-
-// Lazily-built reverse map: OPS numeric id → property name string.
-let _opsIdToName = null;
-
-function _getOpsIdToName() {
-  if (!_opsIdToName) {
-    _opsIdToName = Object.create(null);
-    for (const [name, id] of Object.entries(OPS)) {
-      _opsIdToName[id] = name;
-    }
-  }
-  return _opsIdToName;
-}
-
-function _groupIntoInstructions(tokens) {
-  const { opMap } = EvaluatorPreprocessor;
-  const opsIdToName = _getOpsIdToName();
-  const instructions = [];
-  const cmdNames = Object.create(null);
-  const argBuffer = [];
-  for (const token of tokens) {
-    if (token.type !== "cmd") {
-      argBuffer.push(token);
-      continue;
-    }
-    const op = opMap[token.value];
-    if (op && !(token.value in cmdNames)) {
-      cmdNames[token.value] = opsIdToName[op.id];
-    }
-    let args;
-    if (!op || op.variableArgs) {
-      // Unknown command or variable args: consume all pending args.
-      args = argBuffer.splice(0);
-    } else {
-      // Fixed args: consume exactly numArgs, orphan the rest.
-      const orphanCount = Math.max(0, argBuffer.length - op.numArgs);
-      for (let i = 0; i < orphanCount; i++) {
-        instructions.push({ cmd: null, args: [argBuffer.shift()] });
-      }
-      args = argBuffer.splice(0);
-    }
-    instructions.push({ cmd: token.value, args });
-  }
-  for (const t of argBuffer) {
-    instructions.push({ cmd: null, args: [t] });
-  }
-  return { instructions, cmdNames };
-}
-
-function _tokenToJSObject(obj) {
-  if (obj instanceof Cmd) {
-    return { type: "cmd", value: obj.cmd };
-  }
-  if (obj instanceof Name) {
-    return { type: "name", value: obj.name };
-  }
-  if (obj instanceof Ref) {
-    return { type: "ref", num: obj.num, gen: obj.gen };
-  }
-  if (Array.isArray(obj)) {
-    return { type: "array", value: obj.map(_tokenToJSObject) };
-  }
-  if (obj instanceof Dict) {
-    const result = Object.create(null);
-    for (const [key, val] of obj.getRawEntries()) {
-      result[key] = _tokenToJSObject(val);
-    }
-    return { type: "dict", value: result };
-  }
-  if (typeof obj === "number") {
-    return { type: "number", value: obj };
-  }
-  if (typeof obj === "string") {
-    return { type: "string", value: obj };
-  }
-  if (typeof obj === "boolean") {
-    return { type: "boolean", value: obj };
-  }
-  if (obj === null) {
-    return { type: "null" };
-  }
-  return null;
-}
-
 export { Page, PDFDocument };
diff --git a/src/core/internal_viewer_utils.js b/src/core/internal_viewer_utils.js
new file mode 100644
index 000000000..f5c8877d8
--- /dev/null
+++ b/src/core/internal_viewer_utils.js
@@ -0,0 +1,166 @@
+/* Copyright 2026 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { Cmd, Dict, EOF, Name, Ref } from "./primitives.js";
+import { Lexer, Parser } from "./parser.js";
+import { OPS, shadow } from "../shared/util.js";
+import { BaseStream } from "./base_stream.js";
+import { EvaluatorPreprocessor } from "./evaluator.js";
+
+if (
+  typeof PDFJSDev !== "undefined" &&
+  !PDFJSDev.test("TESTING || INTERNAL_VIEWER")
+) {
+  throw new Error("Not implemented: InternalViewerUtils");
+}
+
+/**
+ * Helpers used by `PDFDocument.toJSObject` to turn content streams into
+ * plain, serializable token/instruction objects.
+ */
+const InternalViewerUtils = {
+  // Parse `stream` into a flat list of plain-object tokens; parsing stops
+  // at EOF or at the first error thrown by the parser.
+  tokenizeStream(stream, xref) {
+    const tokens = [];
+    const parser = new Parser({
+      lexer: new Lexer(stream),
+      xref,
+      allowStreams: false,
+    });
+    while (true) {
+      let obj;
+      try {
+        obj = parser.getObj();
+      } catch {
+        break;
+      }
+      if (obj === EOF) {
+        break;
+      }
+      const token = this.tokenToJSObject(obj);
+      if (token !== null) {
+        tokens.push(token);
+      }
+    }
+    return tokens;
+  },
+
+  // Tokenize a page's /Contents value (a single entry or an array of
+  // entries; non-stream entries are skipped) and group the tokens.
+  getContentTokens(contentsVal, xref) {
+    const refs = Array.isArray(contentsVal) ? contentsVal : [contentsVal];
+    const rawContents = [];
+    const tokens = [];
+    for (const rawRef of refs) {
+      if (rawRef instanceof Ref) {
+        rawContents.push({ num: rawRef.num, gen: rawRef.gen });
+      }
+      const stream = xref.fetchIfRef(rawRef);
+      if (!(stream instanceof BaseStream)) {
+        continue;
+      }
+      tokens.push(...this.tokenizeStream(stream, xref));
+    }
+    const { instructions, cmdNames } = this.groupIntoInstructions(tokens);
+    return { contentStream: true, instructions, cmdNames, rawContents };
+  },
+
+  // Lazily-built reverse map: OPS numeric id → property name string.
+  get opsIdToName() {
+    const opsIdToName = Object.create(null);
+    for (const [name, id] of Object.entries(OPS)) {
+      opsIdToName[id] = name;
+    }
+    return shadow(this, "opsIdToName", opsIdToName);
+  },
+
+  // Group a flat token list into `{ cmd, args }` instructions; operands not
+  // claimed by any command are emitted with `cmd: null`.
+  groupIntoInstructions(tokens) {
+    const { opMap } = EvaluatorPreprocessor;
+    const instructions = [];
+    const cmdNames = Object.create(null);
+    const argBuffer = [];
+    for (const token of tokens) {
+      if (token.type !== "cmd") {
+        argBuffer.push(token);
+        continue;
+      }
+      const op = opMap[token.value];
+      if (op && !(token.value in cmdNames)) {
+        cmdNames[token.value] = this.opsIdToName[op.id];
+      }
+      let args;
+      if (!op || op.variableArgs) {
+        // Unknown command or variable args: consume all pending args.
+        args = argBuffer.splice(0);
+      } else {
+        // Fixed args: consume exactly numArgs, orphan the rest.
+        const orphanCount = Math.max(0, argBuffer.length - op.numArgs);
+        for (let i = 0; i < orphanCount; i++) {
+          instructions.push({ cmd: null, args: [argBuffer.shift()] });
+        }
+        args = argBuffer.splice(0);
+      }
+      instructions.push({ cmd: token.value, args });
+    }
+    for (const t of argBuffer) {
+      instructions.push({ cmd: null, args: [t] });
+    }
+    return { instructions, cmdNames };
+  },
+
+  // Convert a parsed primitive into a plain `{ type, ... }` object, or
+  // `null` for any other kind of value.
+  tokenToJSObject(obj) {
+    if (obj instanceof Cmd) {
+      return { type: "cmd", value: obj.cmd };
+    }
+    if (obj instanceof Name) {
+      return { type: "name", value: obj.name };
+    }
+    if (obj instanceof Ref) {
+      return { type: "ref", num: obj.num, gen: obj.gen };
+    }
+    if (Array.isArray(obj)) {
+      // Use an arrow function so that `this` stays bound for nested
+      // arrays/dictionaries; a bare method reference would lose it.
+      return { type: "array", value: obj.map(o => this.tokenToJSObject(o)) };
+    }
+    if (obj instanceof Dict) {
+      const result = Object.create(null);
+      for (const [key, val] of obj.getRawEntries()) {
+        result[key] = this.tokenToJSObject(val);
+      }
+      return { type: "dict", value: result };
+    }
+    if (typeof obj === "number") {
+      return { type: "number", value: obj };
+    }
+    if (typeof obj === "string") {
+      return { type: "string", value: obj };
+    }
+    if (typeof obj === "boolean") {
+      return { type: "boolean", value: obj };
+    }
+    if (obj === null) {
+      return { type: "null" };
+    }
+    return null;
+  },
+};
+
+export { InternalViewerUtils };
diff --git a/src/core/worker.js b/src/core/worker.js
index 59e65e986..44e438a77 100644
--- a/src/core/worker.js
+++ b/src/core/worker.js
@@ -952,21 +952,25 @@ class WorkerMessageHandler {
       return pdfManager.fontFallback(data.id, handler);
     });
 
-    handler.on("GetRawData", async function ({ ref, page }) {
-      if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) {
-        throw new Error("Not implemented: GetRawData");
-      }
-      let value = null;
-      if (page >= 1) {
-        value = (await pdfManager.ensureCatalog("getPageDict", [page - 1]))[1];
-      } else if (ref) {
-        value =
-          typeof ref === "string"
-            ? Ref.fromString(ref)
-            : Ref.get(ref.num, ref.gen);
-      }
-      return pdfManager.ensureDoc("toJSObject", [value]);
-    });
+    if (
+      typeof PDFJSDev === "undefined" ||
+      PDFJSDev.test("TESTING || INTERNAL_VIEWER")
+    ) {
+      handler.on("GetRawData", async function ({ ref, page }) {
+        let value = null;
+        if (page >= 1) {
+          value = (
+            await pdfManager.ensureCatalog("getPageDict", [page - 1])
+          )[1];
+        } else if (ref) {
+          value =
+            typeof ref === "string"
+              ? Ref.fromString(ref)
+              : Ref.get(ref.num, ref.gen);
+        }
+        return pdfManager.ensureDoc("toJSObject", [value]);
+      });
+    }
 
     handler.on("Cleanup", function (data) {
       return pdfManager.cleanup(/* manuallyTriggered = */ true);