Add attachments when merging/reorganizing a pdf (bug 2026956)

This commit is contained in:
Calixte Denizet 2026-03-31 14:48:06 +02:00
parent a40b91f0bb
commit 5b8c04f383
3 changed files with 144 additions and 3 deletions

View File

@ -1084,6 +1084,15 @@ class Catalog {
return shadow(this, "attachments", attachments);
}
get rawEmbeddedFiles() {
const obj = this.#catDict.get("Names");
if (!(obj instanceof Dict) || !obj.has("EmbeddedFiles")) {
return null;
}
const nameTree = new NameTree(obj.getRaw("EmbeddedFiles"), this.xref);
return nameTree.getAll(/* isRaw = */ true);
}
get xfaImages() {
const obj = this.#catDict.get("Names");
let xfaImages = null;

View File

@ -76,6 +76,7 @@ class DocumentData {
this.hasSignatureAnnotations = false;
this.fieldToParent = new RefSetCache();
this.outline = null;
this.embeddedFiles = null;
}
}
@ -163,6 +164,8 @@ class PDFEditor {
outlineItems = null;
embeddedFiles = new Map();
constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
[this.rootRef, this.rootDict] = this.newDict;
[this.infoRef, this.infoDict] = this.newDict;
@ -694,6 +697,7 @@ class PDFEditor {
await this.#mergeStructTrees(allDocumentData);
await this.#mergeAcroForms(allDocumentData);
this.#buildOutline(allDocumentData);
await this.#collectEmbeddedFiles(allDocumentData);
return this.writePDF();
}
@ -723,6 +727,9 @@ class PDFEditor {
pdfManager
.ensureCatalog("documentOutlineForEditor")
.then(outline => (documentData.outline = outline)),
pdfManager
.ensureCatalog("rawEmbeddedFiles")
.then(ef => (documentData.embeddedFiles = ef)),
]);
const structTreeRoot = documentData.structTreeRoot;
if (structTreeRoot) {
@ -2078,13 +2085,15 @@ class PDFEditor {
const maxLeaves =
MAX_IN_NAME_TREE_NODE <= 1 ? allEntries.length : MAX_IN_NAME_TREE_NODE;
const [treeRef, treeDict] = this.newDict;
const stack = [{ dict: treeDict, entries: allEntries }];
const stack = [{ dict: treeDict, entries: allEntries, isRoot: true }];
const valueType = areNames ? "Names" : "Nums";
while (stack.length > 0) {
const { dict, entries } = stack.pop();
const { dict, entries, isRoot } = stack.pop();
if (entries.length <= maxLeaves) {
dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
if (!isRoot) {
dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
}
dict.set(valueType, entries.flat());
continue;
}
@ -2124,6 +2133,63 @@ class PDFEditor {
rootDict.set("PageLabels", pageLabelsRef);
}
/**
* Collect and clone EmbeddedFiles from all source documents.
* @param {Array<DocumentData>} allDocumentData
*/
async #collectEmbeddedFiles(allDocumentData) {
const { embeddedFiles } = this;
for (const documentData of allDocumentData) {
const {
embeddedFiles: docEmbeddedFiles,
document: { xref },
} = documentData;
if (!docEmbeddedFiles?.size) {
continue;
}
this.currentDocument = documentData;
for (const [key, valueRef] of docEmbeddedFiles) {
let name = key;
if (embeddedFiles.has(name)) {
const displayName = stringToPDFString(
key,
/* keepEscapeSequence = */ true
);
for (let i = 1; ; i++) {
const deduped = `${displayName}_${i}`;
if (!embeddedFiles.has(deduped)) {
name = deduped;
break;
}
}
}
embeddedFiles.set(
name,
await this.#collectDependencies(valueRef, true, xref)
);
}
this.currentDocument = null;
}
}
#makeEmbeddedFilesTree() {
const { embeddedFiles } = this;
if (embeddedFiles.size === 0) {
return;
}
if (!this.namesDict) {
[this.namesRef, this.namesDict] = this.newDict;
this.rootDict.set("Names", this.namesRef);
}
this.namesDict.set(
"EmbeddedFiles",
this.#makeNameNumTree(
Array.from(embeddedFiles.entries()),
/* areNames = */ true
)
);
}
#makeDestinationsTree() {
const { namedDestinations } = this;
if (namedDestinations.size === 0) {
@ -2245,6 +2311,7 @@ class PDFEditor {
this.#makeAcroForm();
this.#makePageTree();
this.#makePageLabelsTree();
this.#makeEmbeddedFilesTree();
this.#makeDestinationsTree();
this.#makeStructTree();
await this.#makeOutline();

View File

@ -6429,6 +6429,71 @@ small scripts as well as for`);
await loadingTask.destroy();
});
// Checks that the "foo.txt" attachment of attachment.pdf is still present,
// with identical metadata and content, after running extractPages on it.
it("preserves EmbeddedFiles (attachments) when extracting pages", async function () {
let loadingTask = getDocument(buildGetDocumentParams("attachment.pdf"));
let pdfDoc = await loadingTask.promise;
// Verify the original document has the expected attachment.
const originalAttachments = await pdfDoc.getAttachments();
expect(originalAttachments["foo.txt"]).toBeDefined();
// NOTE(review): `document: null` presumably selects the document that
// extractPages is called on - confirm against the extractPages API.
const data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
]);
await loadingTask.destroy();
// Reload the generated PDF and inspect its attachments.
loadingTask = getDocument(data);
pdfDoc = await loadingTask.promise;
const attachments = await pdfDoc.getAttachments();
expect(attachments).not.toBeNull();
expect(attachments["foo.txt"]).toEqual({
rawFilename: "foo.txt",
filename: "foo.txt",
// ASCII bytes for "bar baz " followed by a line feed.
content: new Uint8Array([98, 97, 114, 32, 98, 97, 122, 32, 10]),
description: "",
});
await loadingTask.destroy();
});
// Checks that merging a document with itself keeps both copies of its
// attachment, the second one under a renamed key.
it("preserves EmbeddedFiles (attachments) when merging two PDFs", async function () {
let loadingTask = getDocument(buildGetDocumentParams("attachment.pdf"));
let pdfDoc = await loadingTask.promise;
// Merge attachment.pdf with itself: both copies carry "foo.txt", so
// the second one should be deduplicated to "foo.txt_1".
const data = await pdfDoc.extractPages([
{ document: null },
{ document: null },
]);
await loadingTask.destroy();
// Reload the merged PDF and inspect its attachments.
loadingTask = getDocument(data);
pdfDoc = await loadingTask.promise;
const attachments = await pdfDoc.getAttachments();
expect(attachments).not.toBeNull();
// ASCII bytes for "bar baz " followed by a line feed.
const expectedContent = new Uint8Array([
98, 97, 114, 32, 98, 97, 122, 32, 10,
]);
expect(attachments["foo.txt"]).toEqual({
rawFilename: "foo.txt",
filename: "foo.txt",
content: expectedContent,
description: "",
});
// Only the lookup key is expected to change; the filename fields of the
// renamed entry are asserted to still read "foo.txt".
expect(attachments["foo.txt_1"]).toEqual({
rawFilename: "foo.txt",
filename: "foo.txt",
content: expectedContent,
description: "",
});
await loadingTask.destroy();
});
});
describe("AcroForm", function () {