[api-minor] Rewrite the ps lexer & parser and add a small Wasm compiler

The main goal is to remove the eval-based interpreter. To keep performance good, the new parser performs some optimizations on the AST (similar to the ones in the previous implementation), and the Wasm compiler generates code for the optimized AST. For now, in case of errors or unsupported features, the Wasm compiler returns null and the old interpreter is used as a fallback. A few things are still missing: - a Wasm-based interpreter using a stack (in case the PS code isn't stack-free); - a better JS implementation for when Wasm is disabled. These will be added in follow-up patches.
2026-07-26 08:57:21 +02:00 · 2026-03-27 18:35:17 +01:00 · 2026-03-27 18:35:17 +01:00 · 952952c905
commit 952952c905
parent cb2640dc33
8 changed files with 4478 additions and 0 deletions
--- a/src/core/function.js
+++ b/src/core/function.js
@ -21,18 +21,30 @@ import {
  MathClamp,
  shadow,
  unreachable,
+  warn,
 } from "../shared/util.js";
 import { PostScriptLexer, PostScriptParser } from "./ps_parser.js";
 import { BaseStream } from "./base_stream.js";
+import { buildPostScriptWasmFunction } from "./postscript/wasm_compiler.js";
 import { isNumberArray } from "./core_utils.js";
 import { LocalFunctionCache } from "./image_utils.js";

 class PDFFunctionFactory {
+  static #useWasm = true;
+
+  static setOptions({ useWasm }) {
+    PDFFunctionFactory.#useWasm = useWasm;
+  }
+
  constructor({ xref, isEvalSupported = true }) {
    this.xref = xref;
    this.isEvalSupported = isEvalSupported !== false;
  }

+  get useWasm() {
+    return PDFFunctionFactory.#useWasm;
+  }
+
  create(fn, parseArray = false) {
    let fnRef, parsedFn;

@ -358,6 +370,24 @@ class PDFFunction {
      throw new FormatError("No range.");
    }

+    if (factory.useWasm) {
+      try {
+        const wasmFn = buildPostScriptWasmFunction(
+          fn.getString(),
+          domain,
+          range
+        );
+        if (wasmFn) {
+          return wasmFn; // (src, srcOffset, dest, destOffset) → void
+        }
+      } catch {
+        // Fall through to the existing interpreter-based path.
+      }
+    }
+
+    warn("Unable to compile PS function, using interpreter");
+    fn.reset();
+
    const lexer = new PostScriptLexer(fn);
    const parser = new PostScriptParser(lexer);
    const code = parser.parse();
--- a/src/core/pdf_manager.js
+++ b/src/core/pdf_manager.js
@ -28,6 +28,7 @@ import { JpxImage } from "./jpx.js";
 import { MissingDataException } from "./core_utils.js";
 import { OperatorList } from "./operator_list.js";
 import { PDFDocument } from "./document.js";
+import { PDFFunctionFactory } from "./function.js";
 import { Stream } from "./stream.js";

 function parseDocBaseUrl(url) {
@ -97,6 +98,7 @@ class BasePdfManager {
    IccColorSpace.setOptions(options);
    CmykICCBasedCS.setOptions(options);
    JBig2CCITTFaxWasmImage.setOptions(options);
+    PDFFunctionFactory.setOptions(options);
  }

  get docId() {
--- a/src/core/postscript/ast.js
+++ b/src/core/postscript/ast.js
--- a/src/core/postscript/lexer.js
+++ b/src/core/postscript/lexer.js
@ -0,0 +1,225 @@
+/* Copyright 2026 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Token ids. Each id is the position of its name in the list below, so the
+// order of the list is significant: the lexer treats every id from `true`
+// through `ifelse` as a keyword operator.
+const TOKEN = Object.fromEntries(
+  [
+    // Structural tokens — not keyword operators (0–2)
+    "number",
+    "lbrace",
+    "rbrace",
+
+    // Boolean literals (3–4)
+    "true",
+    "false",
+
+    // Arithmetic binary operators (5–11)
+    "add",
+    "sub",
+    "mul",
+    "div",
+    "idiv",
+    "mod",
+    "exp",
+
+    // Comparison binary operators (12–17)
+    "eq",
+    "ne",
+    "gt",
+    "ge",
+    "lt",
+    "le",
+
+    // Bitwise / boolean binary operators (18–21)
+    "and",
+    "or",
+    "xor",
+    "bitshift",
+
+    // Unary arithmetic operators (22–27)
+    "abs",
+    "neg",
+    "ceiling",
+    "floor",
+    "round",
+    "truncate",
+
+    // Unary boolean / bitwise operator (28)
+    "not",
+
+    // Mathematical functions — unary (29–33)
+    "sqrt",
+    "sin",
+    "cos",
+    "ln",
+    "log",
+
+    // Mathematical function — binary (34)
+    "atan",
+
+    // Type conversion operators (35–36)
+    "cvi",
+    "cvr",
+
+    // Stack operators (37–42)
+    "dup",
+    "exch",
+    "pop",
+    "copy",
+    "index",
+    "roll",
+
+    // Control flow (43–44)
+    "if",
+    "ifelse",
+
+    // End of input (45)
+    "eof",
+
+    // Synthetic: produced by the optimizer, never emitted by the lexer (46–47)
+    "min",
+    "max",
+  ].map((name, id) => [name, id])
+);
+
+/**
+ * A single lexical token: a numeric kind plus an optional payload.
+ */
+class Token {
+  /** One of the numeric ids defined in `TOKEN`. */
+  id;
+
+  /** Payload attached to the token; `null` when none was supplied. */
+  value;
+
+  /**
+   * @param {number} id - Token kind.
+   * @param {*} [value] - Optional payload, defaults to `null`.
+   */
+  constructor(id, value = null) {
+    this.id = id;
+    this.value = value;
+  }
+}
+
+/**
+ * Tokenizer for the PostScript operators and literals listed in `TOKEN`.
+ *
+ * `next()` returns one token per call and the shared `eof` token once the
+ * input is exhausted. Input that cannot be lexed (an unknown identifier, an
+ * unexpected character, a malformed or non-finite number) is reported as a
+ * `number` token with value 0 instead of throwing. Every such token still
+ * consumes the offending text, so repeated calls to `next()` always reach
+ * `eof`.
+ */
+class Lexer {
+  // Singletons for every non-number token, built lazily on first construction.
+  // Keyword operator tokens carry their name as `value`; structural tokens
+  // (lbrace, rbrace, eof) carry null.
+  static #singletons = null;
+
+  // Subset of #singletons that an identifier in the input may resolve to.
+  static #operatorSingletons = null;
+
+  static #initSingletons() {
+    const singletons = Object.create(null);
+    const operatorSingletons = Object.create(null);
+    for (const [name, id] of Object.entries(TOKEN)) {
+      if (name === "number") {
+        // Number tokens carry a per-occurrence value, so they are not shared.
+        continue;
+      }
+      const isOperator = id >= TOKEN.true && id <= TOKEN.ifelse;
+      const token = new Token(id, isOperator ? name : null);
+      singletons[name] = token;
+      if (isOperator) {
+        operatorSingletons[name] = token;
+      }
+    }
+    Lexer.#singletons = singletons;
+    Lexer.#operatorSingletons = operatorSingletons;
+  }
+
+  /**
+   * @param {string} data - The program text to tokenize.
+   */
+  constructor(data) {
+    if (!Lexer.#singletons) {
+      Lexer.#initSingletons();
+    }
+    this.data = data;
+    this.pos = 0;
+    this.len = data.length;
+    // Sticky regexes: set lastIndex before exec() to match at an exact offset.
+    this._numberPattern = /[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?/y;
+    this._identifierPattern = /[a-z]+/y;
+  }
+
+  // Skip a % comment, advancing past the next \n or \r (or to EOF).
+  _skipComment() {
+    const lf = this.data.indexOf("\n", this.pos);
+    const cr = this.data.indexOf("\r", this.pos);
+    // Treat a missing EOL as this.len so Math.min picks the one that exists.
+    const eol = Math.min(lf < 0 ? this.len : lf, cr < 0 ? this.len : cr);
+    this.pos = Math.min(eol + 1, this.len);
+  }
+
+  /**
+   * Lex a numeric literal starting at `this.pos`.
+   *
+   * @returns {Token} A `number` token; its value is 0 when the text at
+   *   `this.pos` is not a number or does not parse to a finite value.
+   */
+  _getNumber() {
+    this._numberPattern.lastIndex = this.pos;
+    const match = this._numberPattern.exec(this.data);
+    if (!match) {
+      // A lone sign or dot. Consume it: `next()` rewinds `pos` before calling
+      // this method, so leaving `pos` untouched would make every later call
+      // see the same character again and never reach eof.
+      this.pos++;
+      return new Token(TOKEN.number, 0);
+    }
+    // Consume the literal even if its value is rejected below, for the same
+    // reason as above.
+    this.pos = this._numberPattern.lastIndex;
+    const number = parseFloat(match[0]);
+    return new Token(TOKEN.number, Number.isFinite(number) ? number : 0);
+  }
+
+  /**
+   * Lex a lowercase identifier starting at `this.pos`.
+   *
+   * @returns {Token} The shared operator token for the identifier, or a
+   *   `number` token with value 0 when the identifier is not an operator.
+   */
+  _getOperator() {
+    this._identifierPattern.lastIndex = this.pos;
+    const match = this._identifierPattern.exec(this.data);
+    if (!match) {
+      return new Token(TOKEN.number, 0);
+    }
+    this.pos = this._identifierPattern.lastIndex;
+    const op = match[0];
+    const token = Lexer.#operatorSingletons[op];
+    if (!token) {
+      return new Token(TOKEN.number, 0);
+    }
+    return token;
+  }
+
+  // Return the next token, or Lexer.#singletons.eof at end of input.
+  next() {
+    while (this.pos < this.len) {
+      const ch = this.data.charCodeAt(this.pos++);
+      switch (ch) {
+        // PostScript white-space characters (PDF32000 §7.2.2)
+        case 0x00 /* NUL */:
+        case 0x09 /* HT */:
+        case 0x0a /* LF */:
+        case 0x0c /* FF */:
+        case 0x0d /* CR */:
+        case 0x20 /* SP */:
+          break;
+
+        case 0x25 /* % — comment */:
+          this._skipComment();
+          break;
+
+        case 0x7b /* { */:
+          return Lexer.#singletons.lbrace;
+        case 0x7d /* } */:
+          return Lexer.#singletons.rbrace;
+
+        case 0x2b /* + */:
+        case 0x2d /* - */:
+        case 0x2e /* . */:
+          // Rewind so the number pattern sees the sign / leading dot.
+          this.pos--;
+          return this._getNumber();
+
+        default:
+          if (ch >= 0x30 /* 0 */ && ch <= 0x39 /* 9 */) {
+            this.pos--;
+            return this._getNumber();
+          }
+          if (ch >= 0x61 /* a */ && ch <= 0x7a /* z */) {
+            this.pos--;
+            return this._getOperator();
+          }
+          // Unexpected character: already consumed by the `pos++` above.
+          return new Token(TOKEN.number, 0);
+      }
+    }
+    return Lexer.#singletons.eof;
+  }
+}
+
+export { Lexer, Token, TOKEN };
--- a/src/core/postscript/wasm_compiler.js
+++ b/src/core/postscript/wasm_compiler.js
--- a/test/unit/clitests.json
+++ b/test/unit/clitests.json
@ -44,6 +44,7 @@
    "pdf_spec.js",
    "pdf_viewer.component_spec.js",
    "pdf_viewer_spec.js",
+    "postscript_spec.js",
    "primitives_spec.js",
    "stream_spec.js",
    "struct_tree_spec.js",
--- a/test/unit/jasmine-boot.js
+++ b/test/unit/jasmine-boot.js
@ -87,6 +87,7 @@ async function initializePDFJS(callback) {
      "pdfjs-test/unit/pdf_spec.js",
      "pdfjs-test/unit/pdf_viewer.component_spec.js",
      "pdfjs-test/unit/pdf_viewer_spec.js",
+      "pdfjs-test/unit/postscript_spec.js",
      "pdfjs-test/unit/primitives_spec.js",
      "pdfjs-test/unit/scripting_spec.js",
      "pdfjs-test/unit/stream_spec.js",
--- a/test/unit/postscript_spec.js
+++ b/test/unit/postscript_spec.js