From 5b8c04f383b788d650db445b9bbdd10915752bcb Mon Sep 17 00:00:00 2001
From: Calixte Denizet
Date: Tue, 31 Mar 2026 14:48:06 +0200
Subject: [PATCH] Add attachments when merging/reorganizing a pdf (bug 2026956)

---
Notes (text between the `---` line and the first `diff --git` line is not
part of the commit):

* src/core/catalog.js
  - Adds the `Catalog.rawEmbeddedFiles` getter. It returns `null` unless the
    catalog's "Names" entry is a Dict that has "EmbeddedFiles"; otherwise it
    builds a NameTree on the raw "EmbeddedFiles" entry and returns
    `getAll(/* isRaw = */ true)`.

* src/core/editor/pdf_editor.js
  - `DocumentData` gets an `embeddedFiles` field (initially `null`) that is
    filled from `pdfManager.ensureCatalog("rawEmbeddedFiles")`.
  - `PDFEditor` gets an `embeddedFiles` Map. `#collectEmbeddedFiles()` runs
    after `#buildOutline()` and stores, for every entry of every source
    document, the result of `#collectDependencies(valueRef, true, xref)`.
  - Name collisions: when a key is already in the Map, the entry is stored
    under the first free `<name>_<i>` (i = 1, 2, ...), where `<name>` is
    `stringToPDFString(key, /* keepEscapeSequence = */ true)`.
  - `#makeEmbeddedFilesTree()` is called between `#makePageLabelsTree()` and
    `#makeDestinationsTree()`. It does nothing when the Map is empty;
    otherwise it creates the "Names" dict on the root if `this.namesDict` is
    not set yet and stores the tree built by `#makeNameNumTree()` under
    "EmbeddedFiles".
  - `#makeNameNumTree()` now tags the initial stack item with `isRoot: true`
    and skips writing "Limits" for it when it is emitted as a leaf.
  - NOTE(review): `this.currentDocument` is reset to `null` only after the
    inner loop finishes; confirm that a rejection from
    `#collectDependencies()` cannot leave it pointing at a stale document.

* test/unit/api_spec.js
  - Two new tests on "attachment.pdf": extracting page 0 keeps "foo.txt";
    merging the document with itself yields "foo.txt" and "foo.txt_1".

NOTE(review): the leading whitespace of the diff lines below was restored by
hand (2-space indentation); if a hunk is rejected, check the context-line
indentation and blank context lines against the target tree first.

 src/core/catalog.js           |  9 +++++
 src/core/editor/pdf_editor.js | 73 +++++++++++++++++++++++++++++++++--
 test/unit/api_spec.js         | 65 +++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/src/core/catalog.js b/src/core/catalog.js
index 8b6e30f33..da4502b49 100644
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@@ -1084,6 +1084,15 @@ class Catalog {
     return shadow(this, "attachments", attachments);
   }
 
+  get rawEmbeddedFiles() {
+    const obj = this.#catDict.get("Names");
+    if (!(obj instanceof Dict) || !obj.has("EmbeddedFiles")) {
+      return null;
+    }
+    const nameTree = new NameTree(obj.getRaw("EmbeddedFiles"), this.xref);
+    return nameTree.getAll(/* isRaw = */ true);
+  }
+
   get xfaImages() {
     const obj = this.#catDict.get("Names");
     let xfaImages = null;
diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js
index 615010c9a..585c2eaf4 100644
--- a/src/core/editor/pdf_editor.js
+++ b/src/core/editor/pdf_editor.js
@@ -76,6 +76,7 @@ class DocumentData {
     this.hasSignatureAnnotations = false;
     this.fieldToParent = new RefSetCache();
     this.outline = null;
+    this.embeddedFiles = null;
   }
 }
 
@@ -163,6 +164,8 @@ class PDFEditor {
 
   outlineItems = null;
 
+  embeddedFiles = new Map();
+
   constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
     [this.rootRef, this.rootDict] = this.newDict;
     [this.infoRef, this.infoDict] = this.newDict;
@@ -694,6 +697,7 @@ class PDFEditor {
     await this.#mergeStructTrees(allDocumentData);
     await this.#mergeAcroForms(allDocumentData);
     this.#buildOutline(allDocumentData);
+    await this.#collectEmbeddedFiles(allDocumentData);
 
     return this.writePDF();
   }
@@ -723,6 +727,9 @@ class PDFEditor {
       pdfManager
         .ensureCatalog("documentOutlineForEditor")
         .then(outline => (documentData.outline = outline)),
+      pdfManager
+        .ensureCatalog("rawEmbeddedFiles")
+        .then(ef => (documentData.embeddedFiles = ef)),
     ]);
     const structTreeRoot = documentData.structTreeRoot;
     if (structTreeRoot) {
@@ -2078,13 +2085,15 @@ class PDFEditor {
     const maxLeaves =
       MAX_IN_NAME_TREE_NODE <= 1 ? allEntries.length : MAX_IN_NAME_TREE_NODE;
     const [treeRef, treeDict] = this.newDict;
-    const stack = [{ dict: treeDict, entries: allEntries }];
+    const stack = [{ dict: treeDict, entries: allEntries, isRoot: true }];
     const valueType = areNames ? "Names" : "Nums";
 
     while (stack.length > 0) {
-      const { dict, entries } = stack.pop();
+      const { dict, entries, isRoot } = stack.pop();
       if (entries.length <= maxLeaves) {
-        dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
+        if (!isRoot) {
+          dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
+        }
         dict.set(valueType, entries.flat());
         continue;
       }
@@ -2124,6 +2133,63 @@ class PDFEditor {
     rootDict.set("PageLabels", pageLabelsRef);
   }
 
+  /**
+   * Collect and clone EmbeddedFiles from all source documents.
+   * @param {Array} allDocumentData
+   */
+  async #collectEmbeddedFiles(allDocumentData) {
+    const { embeddedFiles } = this;
+    for (const documentData of allDocumentData) {
+      const {
+        embeddedFiles: docEmbeddedFiles,
+        document: { xref },
+      } = documentData;
+      if (!docEmbeddedFiles?.size) {
+        continue;
+      }
+      this.currentDocument = documentData;
+      for (const [key, valueRef] of docEmbeddedFiles) {
+        let name = key;
+        if (embeddedFiles.has(name)) {
+          const displayName = stringToPDFString(
+            key,
+            /* keepEscapeSequence = */ true
+          );
+          for (let i = 1; ; i++) {
+            const deduped = `${displayName}_${i}`;
+            if (!embeddedFiles.has(deduped)) {
+              name = deduped;
+              break;
+            }
+          }
+        }
+        embeddedFiles.set(
+          name,
+          await this.#collectDependencies(valueRef, true, xref)
+        );
+      }
+      this.currentDocument = null;
+    }
+  }
+
+  #makeEmbeddedFilesTree() {
+    const { embeddedFiles } = this;
+    if (embeddedFiles.size === 0) {
+      return;
+    }
+    if (!this.namesDict) {
+      [this.namesRef, this.namesDict] = this.newDict;
+      this.rootDict.set("Names", this.namesRef);
+    }
+    this.namesDict.set(
+      "EmbeddedFiles",
+      this.#makeNameNumTree(
+        Array.from(embeddedFiles.entries()),
+        /* areNames = */ true
+      )
+    );
+  }
+
   #makeDestinationsTree() {
     const { namedDestinations } = this;
     if (namedDestinations.size === 0) {
@@ -2245,6 +2311,7 @@ class PDFEditor {
     this.#makeAcroForm();
     this.#makePageTree();
     this.#makePageLabelsTree();
+    this.#makeEmbeddedFilesTree();
     this.#makeDestinationsTree();
     this.#makeStructTree();
     await this.#makeOutline();
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index f5148781b..217bd677d 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -6429,6 +6429,71 @@ small scripts as well as for`);
 
         await loadingTask.destroy();
       });
+
+      it("preserves EmbeddedFiles (attachments) when extracting pages", async function () {
+        let loadingTask = getDocument(buildGetDocumentParams("attachment.pdf"));
+        let pdfDoc = await loadingTask.promise;
+
+        // Verify the original document has the expected attachment.
+        const originalAttachments = await pdfDoc.getAttachments();
+        expect(originalAttachments["foo.txt"]).toBeDefined();
+
+        const data = await pdfDoc.extractPages([
+          { document: null, includePages: [0] },
+        ]);
+        await loadingTask.destroy();
+
+        loadingTask = getDocument(data);
+        pdfDoc = await loadingTask.promise;
+
+        const attachments = await pdfDoc.getAttachments();
+        expect(attachments).not.toBeNull();
+        expect(attachments["foo.txt"]).toEqual({
+          rawFilename: "foo.txt",
+          filename: "foo.txt",
+          content: new Uint8Array([98, 97, 114, 32, 98, 97, 122, 32, 10]),
+          description: "",
+        });
+
+        await loadingTask.destroy();
+      });
+
+      it("preserves EmbeddedFiles (attachments) when merging two PDFs", async function () {
+        let loadingTask = getDocument(buildGetDocumentParams("attachment.pdf"));
+        let pdfDoc = await loadingTask.promise;
+
+        // Merge attachment.pdf with itself: both copies carry "foo.txt", so
+        // the second one should be deduplicated to "foo.txt_1".
+        const data = await pdfDoc.extractPages([
+          { document: null },
+          { document: null },
+        ]);
+        await loadingTask.destroy();
+
+        loadingTask = getDocument(data);
+        pdfDoc = await loadingTask.promise;
+
+        const attachments = await pdfDoc.getAttachments();
+        expect(attachments).not.toBeNull();
+
+        const expectedContent = new Uint8Array([
+          98, 97, 114, 32, 98, 97, 122, 32, 10,
+        ]);
+        expect(attachments["foo.txt"]).toEqual({
+          rawFilename: "foo.txt",
+          filename: "foo.txt",
+          content: expectedContent,
+          description: "",
+        });
+        expect(attachments["foo.txt_1"]).toEqual({
+          rawFilename: "foo.txt",
+          filename: "foo.txt",
+          content: expectedContent,
+          description: "",
+        });
+
+        await loadingTask.destroy();
+      });
     });
 
     describe("AcroForm", function () {