Add attachments when merging/reorganizing a pdf (bug 2026956)

2026-06-02 00:01:01 +02:00 · 2026-03-31 14:48:06 +02:00 · 2026-03-31 14:48:06 +02:00 · 5b8c04f383
commit 5b8c04f383
parent a40b91f0bb
3 changed files with 144 additions and 3 deletions
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@ -1084,6 +1084,15 @@ class Catalog {
    return shadow(this, "attachments", attachments);
  }

+  /**
+   * Entries of the `EmbeddedFiles` name tree referenced from the catalog's
+   * `Names` dictionary, fetched through `NameTree.getAll` with `isRaw = true`.
+   *
+   * The result isn't cached with `shadow`, hence the name tree is walked again
+   * on every access.
+   * NOTE(review): presumably a Map from entry key to its raw value (the
+   * editor reads `.size` and iterates `[key, value]` pairs) — confirm against
+   * `NameTree.getAll`.
+   * @returns {Object|null} The raw entries, or `null` when `Names` isn't a
+   *   dictionary or doesn't contain an `EmbeddedFiles` entry.
+   */
+  get rawEmbeddedFiles() {
+    const names = this.#catDict.get("Names");
+    if (names instanceof Dict && names.has("EmbeddedFiles")) {
+      const treeRoot = names.getRaw("EmbeddedFiles");
+      return new NameTree(treeRoot, this.xref).getAll(/* isRaw = */ true);
+    }
+    return null;
+  }
+
  get xfaImages() {
    const obj = this.#catDict.get("Names");
    let xfaImages = null;
--- a/src/core/editor/pdf_editor.js
+++ b/src/core/editor/pdf_editor.js
@ -76,6 +76,7 @@ class DocumentData {
    this.hasSignatureAnnotations = false;
    this.fieldToParent = new RefSetCache();
    this.outline = null;
+    this.embeddedFiles = null;
  }
 }

@ -163,6 +164,8 @@ class PDFEditor {

  outlineItems = null;

+  embeddedFiles = new Map();
+
  constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
    [this.rootRef, this.rootDict] = this.newDict;
    [this.infoRef, this.infoDict] = this.newDict;
@ -694,6 +697,7 @@ class PDFEditor {
    await this.#mergeStructTrees(allDocumentData);
    await this.#mergeAcroForms(allDocumentData);
    this.#buildOutline(allDocumentData);
+    await this.#collectEmbeddedFiles(allDocumentData);

    return this.writePDF();
  }
@ -723,6 +727,9 @@ class PDFEditor {
      pdfManager
        .ensureCatalog("documentOutlineForEditor")
        .then(outline => (documentData.outline = outline)),
+      pdfManager
+        .ensureCatalog("rawEmbeddedFiles")
+        .then(ef => (documentData.embeddedFiles = ef)),
    ]);
    const structTreeRoot = documentData.structTreeRoot;
    if (structTreeRoot) {
@ -2078,13 +2085,15 @@ class PDFEditor {
    const maxLeaves =
      MAX_IN_NAME_TREE_NODE <= 1 ? allEntries.length : MAX_IN_NAME_TREE_NODE;
    const [treeRef, treeDict] = this.newDict;
-    const stack = [{ dict: treeDict, entries: allEntries }];
+    const stack = [{ dict: treeDict, entries: allEntries, isRoot: true }];
    const valueType = areNames ? "Names" : "Nums";

    while (stack.length > 0) {
-      const { dict, entries } = stack.pop();
+      const { dict, entries, isRoot } = stack.pop();
      if (entries.length <= maxLeaves) {
-        dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
+        if (!isRoot) {
+          dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
+        }
        dict.set(valueType, entries.flat());
        continue;
      }
@ -2124,6 +2133,63 @@ class PDFEditor {
    rootDict.set("PageLabels", pageLabelsRef);
  }

+  /**
+   * Clone the EmbeddedFiles entries of every source document into
+   * `this.embeddedFiles`.
+   *
+   * Documents are visited in order, and the first entry stored under a given
+   * key keeps that key untouched. When a key is already taken, the entry is
+   * stored under `<name>_<i>` instead, where `<name>` is the key passed
+   * through `stringToPDFString` and `i` is the smallest positive integer that
+   * gives a name which isn't in use yet.
+   * @param {Array<DocumentData>} allDocumentData
+   */
+  async #collectEmbeddedFiles(allDocumentData) {
+    const merged = this.embeddedFiles;
+    for (const documentData of allDocumentData) {
+      const sourceFiles = documentData.embeddedFiles;
+      const { xref } = documentData.document;
+      if (!sourceFiles?.size) {
+        // Nothing was attached to this document.
+        continue;
+      }
+      // NOTE(review): presumably read while cloning the dependencies below —
+      // confirm against `#collectDependencies`.
+      this.currentDocument = documentData;
+      for (const [key, valueRef] of sourceFiles) {
+        let name = key;
+        if (merged.has(key)) {
+          // Another entry already uses this key: append a numeric suffix
+          // until the name is free.
+          const displayName = stringToPDFString(
+            key,
+            /* keepEscapeSequence = */ true
+          );
+          let suffix = 1;
+          do {
+            name = `${displayName}_${suffix++}`;
+          } while (merged.has(name));
+        }
+        const cloned = await this.#collectDependencies(valueRef, true, xref);
+        merged.set(name, cloned);
+      }
+      this.currentDocument = null;
+    }
+  }
+
+  /**
+   * Store the collected embedded files, if there are any, as a name tree
+   * under the `EmbeddedFiles` key of the `Names` dictionary. That dictionary
+   * is created, and linked from the root dictionary, when it doesn't exist
+   * yet.
+   */
+  #makeEmbeddedFilesTree() {
+    const { size } = this.embeddedFiles;
+    if (size === 0) {
+      return;
+    }
+    if (!this.namesDict) {
+      const [ref, dict] = this.newDict;
+      this.namesRef = ref;
+      this.namesDict = dict;
+      this.rootDict.set("Names", ref);
+    }
+    // Hand the `[name, value]` pairs over as a plain array.
+    const entries = [...this.embeddedFiles];
+    const tree = this.#makeNameNumTree(entries, /* areNames = */ true);
+    this.namesDict.set("EmbeddedFiles", tree);
+  }
+
  #makeDestinationsTree() {
    const { namedDestinations } = this;
    if (namedDestinations.size === 0) {
@ -2245,6 +2311,7 @@ class PDFEditor {
    this.#makeAcroForm();
    this.#makePageTree();
    this.#makePageLabelsTree();
+    this.#makeEmbeddedFilesTree();
    this.#makeDestinationsTree();
    this.#makeStructTree();
    await this.#makeOutline();
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -6429,6 +6429,71 @@ small scripts as well as for`);

        await loadingTask.destroy();
      });
+
+      it("preserves EmbeddedFiles (attachments) when extracting pages", async function () {
+        const sourceTask = getDocument(buildGetDocumentParams("attachment.pdf"));
+        const sourceDoc = await sourceTask.promise;
+
+        // Sanity check: the fixture must carry the attachment that the
+        // extraction below is expected to keep.
+        const originalAttachments = await sourceDoc.getAttachments();
+        expect(originalAttachments["foo.txt"]).toBeDefined();
+
+        // Only keep the page at index 0.
+        const data = await sourceDoc.extractPages([
+          { document: null, includePages: [0] },
+        ]);
+        await sourceTask.destroy();
+
+        // Reload the generated PDF and check what it contains.
+        const extractedTask = getDocument(data);
+        const extractedDoc = await extractedTask.promise;
+        const attachments = await extractedDoc.getAttachments();
+
+        // The bytes of "bar baz \n".
+        const expectedContent = new Uint8Array([
+          98, 97, 114, 32, 98, 97, 122, 32, 10,
+        ]);
+        expect(attachments).not.toBeNull();
+        expect(attachments["foo.txt"]).toEqual({
+          rawFilename: "foo.txt",
+          filename: "foo.txt",
+          content: expectedContent,
+          description: "",
+        });
+
+        await extractedTask.destroy();
+      });
+
+      it("preserves EmbeddedFiles (attachments) when merging two PDFs", async function () {
+        const sourceTask = getDocument(buildGetDocumentParams("attachment.pdf"));
+        const sourceDoc = await sourceTask.promise;
+
+        // attachment.pdf is merged with itself, hence "foo.txt" is provided
+        // twice: the first copy keeps its key and the second one is expected
+        // under "foo.txt_1".
+        const data = await sourceDoc.extractPages([
+          { document: null },
+          { document: null },
+        ]);
+        await sourceTask.destroy();
+
+        const mergedTask = getDocument(data);
+        const mergedDoc = await mergedTask.promise;
+        const attachments = await mergedDoc.getAttachments();
+        expect(attachments).not.toBeNull();
+
+        // Only the key differs between the two entries: the file name and the
+        // content are expected to be the same for both.
+        const expectedAttachment = {
+          rawFilename: "foo.txt",
+          filename: "foo.txt",
+          content: new Uint8Array([98, 97, 114, 32, 98, 97, 122, 32, 10]),
+          description: "",
+        };
+        expect(attachments["foo.txt"]).toEqual(expectedAttachment);
+        expect(attachments["foo.txt_1"]).toEqual(expectedAttachment);
+
+        await mergedTask.destroy();
+      });
    });

    describe("AcroForm", function () {