Merge pull request #21465 from calixteman/fix_attachments

Re-derive annotation attachment content from the xref after cleanup
2026-06-23 00:15:51 +02:00 · 2026-06-18 16:48:13 +02:00 · 2026-06-18 16:48:13 +02:00 · 3956ac1b39
commit 3956ac1b39
parent 187c22126a 9d9fb06d7f
5 changed files with 239 additions and 118 deletions
--- a/src/core/annotation.js
+++ b/src/core/annotation.js
@ -5408,33 +5408,28 @@ class FileAttachmentAnnotation extends MarkupAnnotation {
    super(params);

    const { annotationGlobals, dict } = params;
-    const fileSpecRef = dict.getRaw("FS");
    const fsDict = dict.get("FS");
    const file = new FileSpec(fsDict);
    /** @type {{catalog?: Catalog}} */
    const { catalog } = annotationGlobals.pdfManager.pdfDocument;

-    // When this annotation references an embedded file that’s already in the
-    // catalog `NameTree` (such as `EFOpen`), reuse that `NameTree` id so the
-    // sidebar and annotation paths resolve the same attachment identity.
-    let fileId =
-      fileSpecRef instanceof Ref
-        ? catalog?.attachmentIdByRef.get(fileSpecRef)
-        : undefined;
-
-    // Fallback ids are namespaced to keep annotation-local ids distinct from
-    // `NameTree` ids (which are filename-based).
-    if (catalog && fsDict instanceof Dict && typeof fileId !== "string") {
-      const baseFileId = `annotation:${this.data.id}`;
-      fileId = baseFileId;
-
-      let i = 1;
-      while (catalog.attachmentDictById.has(fileId)) {
-        fileId = `${baseFileId}-${i++}`;
+    // Encode the embedded content's reference in the id so it can be
+    // re-fetched from the xref on demand (see `Catalog.attachmentContent`)
+    // instead of being cached where `cleanup` would wipe it. The file-spec is
+    // usually an indirect object; when it's inline, its embedded-file stream is
+    // still indirect (streams always are), so fall back to that stream's ref.
+    let fileId;
+    if (fsDict instanceof Dict) {
+      let contentRef = dict.getRaw("FS");
+      if (!(contentRef instanceof Ref)) {
+        contentRef = FileSpec.pickPlatformItem(
+          fsDict.get("EF"),
+          /* raw = */ true
+        );
+      }
+      if (contentRef instanceof Ref) {
+        fileId = catalog?.getAttachmentIdForAnnotation(contentRef);
      }
-
-      // Cache only fallbacks.
-      catalog.attachmentDictById.set(fileId, fsDict);
    }

    this.data.hasOwnCanvas = this.data.noRotate;
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@ -119,18 +119,12 @@ function fetchRemoteDest(action) {
 class Catalog {
  #actualNumPages = null;

-  /** @type {RefSetCache | null} */
-  #attachmentIdByRef = null;
+  #annotationAttachmentIdByRef = new RefSetCache();
+
+  #annotationAttachmentRefById = new Map();

  #catDict = null;

-  /**
-   * Attachment dictionaries keyed by attachment id.
-   *
-   * @type {Map<string, Dict>}
-   */
-  attachmentDictById = new Map();
-
  builtInCMapCache = new Map();

  fontCache = new RefSetCache();
@ -164,33 +158,44 @@ class Catalog {
    this.toplevelPagesDict; // eslint-disable-line no-unused-expressions
  }

-  /**
-   * Attachment ids keyed by embedded-file reference.
-   *
-   * @type {RefSetCache}
-   */
-  get attachmentIdByRef() {
-    if (this.#attachmentIdByRef) {
-      return this.#attachmentIdByRef;
-    }
-
-    const attachmentIdByRef = new RefSetCache();
-    for (const [name, ref] of this.rawEmbeddedFiles || []) {
-      if (!(ref instanceof Ref)) {
-        continue;
-      }
-      attachmentIdByRef.put(
-        ref,
-        stringToPDFString(name, /* keepEscapeSequence = */ true)
-      );
-    }
-    return (this.#attachmentIdByRef = attachmentIdByRef);
-  }
-
  cloneDict() {
    return this.#catDict.clone();
  }

+  /**
+   * Create an id for an attachment from a FileAttachment annotation.
+   *
+   * The id is registered here rather than parsed from a public string prefix in
+   * `attachmentContent`, since catalog attachment names can be arbitrary PDF
+   * strings and may otherwise collide with annotation-local ids.
+   *
+   * Calling this again with a reference that was already registered returns
+   * the id created the first time. A new id starts out as
+   * `attachmentRef:<ref.toString()>`; while that string is already registered
+   * for another reference, or `this.attachments?.has(id)` reports it as taken
+   * (presumably by a named catalog attachment; `attachments` is defined
+   * outside this view), a numeric suffix (`-1`, `-2`, ...) is appended.
+   *
+   * Both directions of the mapping are stored: ref -> id so that repeated
+   * calls are stable, and id -> ref so that `attachmentContent` can fetch the
+   * content from the xref later.
+   *
+   * @param {Ref} ref
+   *   File-spec or embedded-file stream reference.
+   * @returns {string}
+   *   Attachment id.
+   */
+  getAttachmentIdForAnnotation(ref) {
+    let id = this.#annotationAttachmentIdByRef.get(ref);
+    if (id) {
+      return id;
+    }
+
+    const baseId = `attachmentRef:${ref.toString()}`;
+    id = baseId;
+
+    let i = 1;
+    while (
+      this.#annotationAttachmentRefById.has(id) ||
+      this.attachments?.has(id)
+    ) {
+      id = `${baseId}-${i++}`;
+    }
+
+    this.#annotationAttachmentIdByRef.put(ref, id);
+    this.#annotationAttachmentRefById.set(id, ref);
+    return id;
+  }
+
  get version() {
    const version = this.#catDict.get("Version");
    if (version instanceof Name) {
@ -1157,19 +1162,12 @@ class Catalog {
  }

  /**
-   * Get content for an attachment.
-   *
   * @param {string} id
-   *   Unique attachment identifier (required).
-   * @returns {CatalogAttachmentContent}
-   *   Content.
+   *   Unique attachment identifier.
+   * @returns {CatalogAttachmentContent | undefined}
+   *   Content, or `undefined` when no named attachment exists for the id.
   */
-  attachmentContent(id) {
-    const dict = this.attachmentDictById.get(id);
-    if (dict) {
-      return FileSpec.readContent(dict);
-    }
-
+  #attachmentContentByName(id) {
    const obj = this.#catDict.get("Names");
    if (obj instanceof Dict && obj.has("EmbeddedFiles")) {
      const nameTree = new NameTree(obj.getRaw("EmbeddedFiles"), this.xref);
@ -1179,6 +1177,36 @@ class Catalog {
        }
      }
    }
+    return undefined;
+  }
+
+  /**
+   * Get content for an attachment.
+   *
+   * The id is first looked up among the named attachments; anything other
+   * than `undefined` from that lookup is returned as-is. Only then is the id
+   * looked up among the references registered by
+   * `getAttachmentIdForAnnotation`, and the referenced object is fetched from
+   * the xref on every call: a stream is read directly, a dictionary is read
+   * as a file-spec, and any other value yields `null`.
+   *
+   * @param {string} id
+   *   Unique attachment identifier (required).
+   * @returns {CatalogAttachmentContent}
+   *   Content, or `null` when the id is unknown or the referenced object is
+   *   neither a stream nor a dictionary.
+   * @throws {PasswordException}
+   *   Propagated from `FileSpec.readStreamContent` when the bytes are
+   *   encrypted and no key is available.
+   */
+  attachmentContent(id) {
+    const namedContent = this.#attachmentContentByName(id);
+    if (namedContent !== undefined) {
+      return namedContent;
+    }
+
+    // Annotation-local attachments register the reference of their embedded
+    // content in the catalog, so it's re-fetched from the xref on demand
+    // instead of being cached (which would then need to survive `cleanup`).
+    // The reference points either at the file-spec dictionary or, for an inline
+    // file-spec, straight at the embedded-file stream.
+    const ref = this.#annotationAttachmentRefById.get(id);
+    if (ref) {
+      const target = this.xref.fetch(ref);
+      if (target instanceof BaseStream) {
+        return FileSpec.readStreamContent(target);
+      }
+      return target instanceof Dict ? FileSpec.readContent(target) : null;
+    }
    return null;
  }

@ -1280,9 +1308,6 @@ class Catalog {

  async cleanup(manuallyTriggered = false) {
    clearGlobalCaches();
-    this.#attachmentIdByRef?.clear();
-    this.#attachmentIdByRef = null;
-    this.attachmentDictById.clear();
    this.globalColorSpaceCache.clear();
    this.globalImageCache.clear(/* onlyData = */ manuallyTriggered);
    this.pageKidsCountCache.clear();
--- a/src/core/file_spec.js
+++ b/src/core/file_spec.js
@ -27,29 +27,6 @@ import { stringToPDFString } from "./string_utils.js";
 * @import { CatalogAttachmentContent } from "./catalog.js";
 */

-/**
- * Get a platform-specific item from a file-spec dictionary.
- *
- * Search order follows the PDF platform keys: `UF`, `F`, `Unix`, `Mac`,
- * `DOS`.
- *
- * @param {Dict | null | undefined} dict
- *   Dictionary.
- * @returns {unknown}
- *   Matching dictionary value or `null` when no key is found.
- */
-function pickPlatformItem(dict) {
-  if (dict instanceof Dict) {
-    // Look for the filename in this order: UF, F, Unix, Mac, DOS
-    for (const key of ["UF", "F", "Unix", "Mac", "DOS"]) {
-      if (dict.has(key)) {
-        return dict.get(key);
-      }
-    }
-  }
-  return null;
-}
-
 /**
 * "A PDF file can refer to the contents of another file by using a File
 * Specification (PDF 1.1)", see the spec (7.11) for more details.
@ -76,7 +53,7 @@ class FileSpec {
  }

  get filename() {
-    const item = pickPlatformItem(this.root);
+    const item = FileSpec.pickPlatformItem(this.root);
    if (item && typeof item === "string") {
      // NOTE: The following replacement order is INTENTIONAL, regardless of
      //       what some static code analysers (e.g. CodeQL) may claim.
@ -105,6 +82,31 @@ class FileSpec {
    };
  }

+  /**
+   * Get a platform-specific item from a file-spec dictionary.
+   *
+   * Search order follows the PDF platform keys: `UF`, `F`, `Unix`, `Mac`,
+   * `DOS`.
+   *
+   * Only the first key that is present is considered: its value is returned
+   * whatever it is, and the remaining keys are not tried. Anything that is
+   * not a `Dict` (including `null` and `undefined`) yields `null`.
+   *
+   * @param {Dict | null | undefined} dict
+   *   Dictionary.
+   * @param {boolean} [raw]
+   *   Return the raw (possibly indirect) value rather than the resolved one.
+   *   Defaults to `false`.
+   * @returns {unknown}
+   *   Matching dictionary value or `null` when no key is found.
+   */
+  static pickPlatformItem(dict, raw = false) {
+    if (dict instanceof Dict) {
+      // Look for the item in this order: UF, F, Unix, Mac, DOS
+      for (const key of ["UF", "F", "Unix", "Mac", "DOS"]) {
+        if (dict.has(key)) {
+          return raw ? dict.getRaw(key) : dict.get(key);
+        }
+      }
+    }
+    return null;
+  }
+
  /**
   * Read attachment bytes from a file-spec dictionary.
   *
@ -119,24 +121,36 @@ class FileSpec {
    if (!(dict instanceof Dict)) {
      return null;
    }
-    const ef = pickPlatformItem(dict.get("EF"));
+    const ef = this.pickPlatformItem(dict.get("EF"));
    if (!(ef instanceof BaseStream)) {
      warn(
        "Embedded file specification points to non-existing/invalid content"
      );
      return null;
    }
+    return this.readStreamContent(ef);
+  }

+  /**
+   * Read the bytes of an embedded-file stream.
+   *
+   * The encryption state is taken from the xref attached to the stream's own
+   * dictionary, so no file-spec dictionary is needed. The check is a strict
+   * comparison against `null`: when the stream has no dictionary, xref or
+   * `encrypt` object, or when `encryptionKey` is anything other than `null`,
+   * the bytes are read.
+   *
+   * @param {BaseStream} stream
+   *   Embedded-file stream.
+   * @returns {CatalogAttachmentContent}
+   *   Attachment bytes.
+   * @throws {PasswordException}
+   *   When the bytes are encrypted and no key is available.
+   */
+  static readStreamContent(stream) {
    // Throw if we need a password but don’t have one.
-    const encrypt = dict.xref?.encrypt;
+    const encrypt = stream.dict?.xref?.encrypt;
    if (encrypt?.encryptionKey === null) {
      throw new PasswordException(
        "No password given",
        PasswordResponses.NEED_PASSWORD
      );
    }
-
-    return ef.getBytes();
+    return stream.getBytes();
  }
 }

--- a/test/unit/annotation_spec.js
+++ b/test/unit/annotation_spec.js
@ -26,6 +26,7 @@ import {
  AnnotationFieldFlag,
  AnnotationFlag,
  AnnotationType,
+  bytesToString,
  DrawOPS,
  OPS,
  RenderingIntentFlag,
@ -41,6 +42,7 @@ import {
 } from "./test_utils.js";
 import { Dict, Name, Ref, RefSetCache } from "../../src/core/primitives.js";
 import { Lexer, Parser } from "../../src/core/parser.js";
+import { Catalog } from "../../src/core/catalog.js";
 import { FlateStream } from "../../src/core/flate_stream.js";
 import { PartialEvaluator } from "../../src/core/evaluator.js";
 import { StringStream } from "../../src/core/stream.js";
@ -52,9 +54,10 @@ describe("annotation", function () {
    constructor(params) {
      this.pdfDocument = {
        catalog: {
-          attachmentDictById: new Map(),
-          attachmentIdByRef: new RefSetCache(),
          baseUrl: params.docBaseUrl || null,
+          getAttachmentIdForAnnotation(ref) {
+            return `attachmentRef:${ref.toString()}`;
+          },
        },
      };
      this.evaluatorOptions = {
@ -4403,20 +4406,17 @@ describe("annotation", function () {
        idFactoryMock
      );
      expect(data.annotationType).toEqual(AnnotationType.FILEATTACHMENT);
-      expect(data.fileId.startsWith("annotation:")).toEqual(true);
+      // The file-spec is an indirect object, so its reference is encoded in the
+      // id and re-fetched on demand.
+      expect(data.fileId).toEqual("attachmentRef:19R");
      expect(data.file).toEqual({
        rawFilename: "Test.txt",
        filename: "Test.txt",
        description: "abc",
      });
-
-      // Content lookup and reading requires a bigger mock than used here.
-      expect(
-        pdfManagerMock.pdfDocument.catalog.attachmentDictById.has(data.fileId)
-      ).toEqual(true);
    });

-    it("should reuse the attachment NameTree id for referenced files", async function () {
+    it("should re-derive an inline file attachment from its embedded stream", async function () {
      const fileStream = new StringStream(
        "<<\n" +
          "/Type /EmbeddedFile\n" +
@ -4432,41 +4432,36 @@ describe("annotation", function () {
        allowStreams: true,
      });

-      const fileStreamRef = Ref.get(28, 0);
+      const fileStreamRef = Ref.get(18, 0);
      const fileStreamDict = parser.getObj();

      const embeddedFileDict = new Dict();
      embeddedFileDict.set("F", fileStreamRef);

-      const fileSpecRef = Ref.get(29, 0);
+      // The file-spec is inline (not an indirect object), so the embedded-file
+      // stream's reference is encoded in the id instead.
      const fileSpecDict = new Dict();
      fileSpecDict.set("Type", Name.get("Filespec"));
      fileSpecDict.set("Desc", "abc");
      fileSpecDict.set("EF", embeddedFileDict);
      fileSpecDict.set("UF", "Test.txt");

-      const fileAttachmentRef = Ref.get(30, 0);
+      const fileAttachmentRef = Ref.get(20, 0);
      const fileAttachmentDict = new Dict();
      fileAttachmentDict.set("Type", Name.get("Annot"));
      fileAttachmentDict.set("Subtype", Name.get("FileAttachment"));
-      fileAttachmentDict.set("FS", fileSpecRef);
+      fileAttachmentDict.set("FS", fileSpecDict);
      fileAttachmentDict.set("T", "Topic");
      fileAttachmentDict.set("Contents", "Test.txt");

      const xref = new XRefMock([
        { ref: fileStreamRef, data: fileStreamDict },
-        { ref: fileSpecRef, data: fileSpecDict },
        { ref: fileAttachmentRef, data: fileAttachmentDict },
      ]);
      embeddedFileDict.assignXref(xref);
      fileSpecDict.assignXref(xref);
      fileAttachmentDict.assignXref(xref);

-      pdfManagerMock.pdfDocument.catalog.attachmentIdByRef.put(
-        fileSpecRef,
-        "Test.txt"
-      );
-
      const { data } = await AnnotationFactory.create(
        xref,
        fileAttachmentRef,
@ -4474,17 +4469,81 @@ describe("annotation", function () {
        idFactoryMock
      );
      expect(data.annotationType).toEqual(AnnotationType.FILEATTACHMENT);
-      expect(data.fileId).toEqual("Test.txt");
+      expect(data.fileId).toEqual("attachmentRef:18R");
      expect(data.file).toEqual({
        rawFilename: "Test.txt",
        filename: "Test.txt",
        description: "abc",
      });
+    });

-      // File should not be added as it’s already referenced in the `NameTree`.
+    it("should keep named attachment ids distinct from annotation attachment ids", function () {
+      const annotationStreamRef = Ref.get(18, 0);
+      const annotationStreamDict = new Dict();
+      annotationStreamDict.set("Type", Name.get("EmbeddedFile"));
+      const annotationStream = new StringStream(
+        "Annotation attachment",
+        annotationStreamDict
+      );
+
+      const namedStreamRef = Ref.get(21, 0);
+      const namedStreamDict = new Dict();
+      namedStreamDict.set("Type", Name.get("EmbeddedFile"));
+      const namedStream = new StringStream("Named attachment", namedStreamDict);
+
+      const namedEmbeddedFileDict = new Dict();
+      namedEmbeddedFileDict.set("F", namedStreamRef);
+
+      const namedFileSpecRef = Ref.get(22, 0);
+      const namedFileSpecDict = new Dict();
+      namedFileSpecDict.set("Type", Name.get("Filespec"));
+      namedFileSpecDict.set("EF", namedEmbeddedFileDict);
+      namedFileSpecDict.set("F", "Named.txt");
+
+      const pagesDict = new Dict();
+      const embeddedFilesDict = new Dict();
+      embeddedFilesDict.set("Names", ["attachmentRef:18R", namedFileSpecRef]);
+
+      const namesDict = new Dict();
+      namesDict.set("EmbeddedFiles", embeddedFilesDict);
+
+      const catalogDict = new Dict();
+      catalogDict.set("Pages", pagesDict);
+      catalogDict.set("Names", namesDict);
+
+      const xref = new XRefMock([
+        { ref: annotationStreamRef, data: annotationStream },
+        { ref: namedStreamRef, data: namedStream },
+        { ref: namedFileSpecRef, data: namedFileSpecDict },
+      ]);
+      xref.getCatalogObj = () => catalogDict;
+
+      for (const dict of [
+        annotationStreamDict,
+        namedStreamDict,
+        namedEmbeddedFileDict,
+        namedFileSpecDict,
+        pagesDict,
+        embeddedFilesDict,
+        namesDict,
+        catalogDict,
+      ]) {
+        dict.assignXref(xref);
+      }
+
+      const catalog = new Catalog(pdfManagerMock, xref);
+      const annotationId =
+        catalog.getAttachmentIdForAnnotation(annotationStreamRef);
+
+      expect(annotationId).toEqual("attachmentRef:18R-1");
      expect(
-        pdfManagerMock.pdfDocument.catalog.attachmentDictById.has(data.fileId)
-      ).toEqual(false);
+        bytesToString(catalog.attachmentContent("attachmentRef:18R"))
+      ).toEqual("Named attachment");
+      expect(bytesToString(catalog.attachmentContent(annotationId))).toEqual(
+        "Annotation attachment"
+      );
+      // An unknown id resolves to no content.
+      expect(catalog.attachmentContent("nonexistent")).toEqual(null);
    });
  });

--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -3915,6 +3915,34 @@ describe("api", function () {
      await loadingTask.destroy();
    });

+    it("gets FileAttachment annotation content that stays readable after cleanup", async function () {
+      // The embedded files are reachable only via the annotations (no catalog
+      // `/Names` tree), so their content must survive `cleanup` by being
+      // re-derivable from the xref.
+      const loadingTask = getDocument(buildGetDocumentParams("bug1230933.pdf"));
+      const pdfDoc = await loadingTask.promise;
+      const pdfPage = await pdfDoc.getPage(1);
+      const annotations = await pdfPage.getAnnotations();
+
+      const fileAnnotation = annotations.find(
+        a => a.annotationType === AnnotationType.FILEATTACHMENT
+      );
+      const { fileId } = fileAnnotation;
+      expect(fileId.startsWith("attachmentRef:")).toEqual(true);
+
+      const before = await pdfDoc.getAttachmentContent(fileId);
+      expect(before).toBeInstanceOf(Uint8Array);
+      expect(before.length).toEqual(234414);
+
+      await pdfDoc.cleanup();
+
+      const after = await pdfDoc.getAttachmentContent(fileId);
+      expect(after).toBeInstanceOf(Uint8Array);
+      expect(after.length).toEqual(234414);
+
+      await loadingTask.destroy();
+    });
+
    it("gets annotations containing /Launch action with /FileSpec dictionary (issue 17846)", async function () {
      const loadingTask = getDocument(buildGetDocumentParams("issue17846.pdf"));
      const pdfDoc = await loadingTask.promise;