From 9d9fb06d7f60f11ce671224782a5c7be9b544052 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Wed, 17 Jun 2026 14:49:25 +0200 Subject: [PATCH] Re-derive annotation attachment content from the xref after cleanup Annotation-local attachments (those not in the catalog `/Names` tree) were resolved through a dictionary cache that `Catalog.cleanup` clears, so their content became unreachable once the idle cleanup had run. Encode the reference of the embedded content in the attachment id and re-fetch it from the xref on demand instead of caching the dictionary, so the content stays reachable without anything having to survive cleanup. It fixes a regression introduced by #21351. --- src/core/annotation.js | 37 +++++------ src/core/catalog.js | 117 +++++++++++++++++++++-------------- src/core/file_spec.js | 70 ++++++++++++--------- test/unit/annotation_spec.js | 105 ++++++++++++++++++++++++------- test/unit/api_spec.js | 28 +++++++++ 5 files changed, 239 insertions(+), 118 deletions(-) diff --git a/src/core/annotation.js b/src/core/annotation.js index b2f4cb82e..0fd275c74 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -5408,33 +5408,28 @@ class FileAttachmentAnnotation extends MarkupAnnotation { super(params); const { annotationGlobals, dict } = params; - const fileSpecRef = dict.getRaw("FS"); const fsDict = dict.get("FS"); const file = new FileSpec(fsDict); /** @type {{catalog?: Catalog}} */ const { catalog } = annotationGlobals.pdfManager.pdfDocument; - // When this annotation references an embedded file that’s already in the - // catalog `NameTree` (such as `EFOpen`), reuse that `NameTree` id so the - // sidebar and annotation paths resolve the same attachment identity. - let fileId = - fileSpecRef instanceof Ref - ? catalog?.attachmentIdByRef.get(fileSpecRef) - : undefined; - - // Fallback ids are namespaced to keep annotation-local ids distinct from - // `NameTree` ids (which are filename-based). - if (catalog && fsDict instanceof Dict && typeof fileId !== "string") { - const baseFileId = `annotation:${this.data.id}`; - fileId = baseFileId; - - let i = 1; - while (catalog.attachmentDictById.has(fileId)) { - fileId = `${baseFileId}-${i++}`; + // Encode the embedded content's reference in the id so it can be + // re-fetched from the xref on demand (see `Catalog.attachmentContent`) + // instead of being cached where `cleanup` would wipe it. The file-spec is + // usually indirect; when it's inline its embedded-file stream still isn't + // (streams are always indirect), so fall back to that ref. + let fileId; + if (fsDict instanceof Dict) { + let contentRef = dict.getRaw("FS"); + if (!(contentRef instanceof Ref)) { + contentRef = FileSpec.pickPlatformItem( + fsDict.get("EF"), + /* raw = */ true + ); + } + if (contentRef instanceof Ref) { + fileId = catalog?.getAttachmentIdForAnnotation(contentRef); } - - // Cache only fallbacks. - catalog.attachmentDictById.set(fileId, fsDict); } this.data.hasOwnCanvas = this.data.noRotate; diff --git a/src/core/catalog.js b/src/core/catalog.js index 7c2acbcf6..d66ecb943 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -119,18 +119,12 @@ function fetchRemoteDest(action) { class Catalog { #actualNumPages = null; - /** @type {RefSetCache | null} */ - #attachmentIdByRef = null; + #annotationAttachmentIdByRef = new RefSetCache(); + + #annotationAttachmentRefById = new Map(); #catDict = null; - /** - * Attachment dictionaries keyed by attachment id. - * - * @type {Map} - */ - attachmentDictById = new Map(); - builtInCMapCache = new Map(); fontCache = new RefSetCache(); @@ -164,33 +158,44 @@ class Catalog { this.toplevelPagesDict; // eslint-disable-line no-unused-expressions } - /** - * Attachment ids keyed by embedded-file reference. - * - * @type {RefSetCache} - */ - get attachmentIdByRef() { - if (this.#attachmentIdByRef) { - return this.#attachmentIdByRef; - } - - const attachmentIdByRef = new RefSetCache(); - for (const [name, ref] of this.rawEmbeddedFiles || []) { - if (!(ref instanceof Ref)) { - continue; - } - attachmentIdByRef.put( - ref, - stringToPDFString(name, /* keepEscapeSequence = */ true) - ); - } - return (this.#attachmentIdByRef = attachmentIdByRef); - } - cloneDict() { return this.#catDict.clone(); } + /** + * Create an id for an attachment from a FileAttachment annotation. + * + * The id is registered here rather than parsed from a public string prefix in + * `attachmentContent`, since catalog attachment names can be arbitrary PDF + * strings and may otherwise collide with annotation-local ids. + * + * @param {Ref} ref + * File-spec or embedded-file stream reference. + * @returns {string} + * Attachment id. + */ + getAttachmentIdForAnnotation(ref) { + let id = this.#annotationAttachmentIdByRef.get(ref); + if (id) { + return id; + } + + const baseId = `attachmentRef:${ref.toString()}`; + id = baseId; + + let i = 1; + while ( + this.#annotationAttachmentRefById.has(id) || + this.attachments?.has(id) + ) { + id = `${baseId}-${i++}`; + } + + this.#annotationAttachmentIdByRef.put(ref, id); + this.#annotationAttachmentRefById.set(id, ref); + return id; + } + get version() { const version = this.#catDict.get("Version"); if (version instanceof Name) { @@ -1157,19 +1162,12 @@ class Catalog { } /** - * Get content for an attachment. - * * @param {string} id - * Unique attachment identifier (required). - * @returns {CatalogAttachmentContent} - * Content. + * Unique attachment identifier. + * @returns {CatalogAttachmentContent | undefined} + * Content, or `undefined` when no named attachment exists for the id. */ - attachmentContent(id) { - const dict = this.attachmentDictById.get(id); - if (dict) { - return FileSpec.readContent(dict); - } - + #attachmentContentByName(id) { const obj = this.#catDict.get("Names"); if (obj instanceof Dict && obj.has("EmbeddedFiles")) { const nameTree = new NameTree(obj.getRaw("EmbeddedFiles"), this.xref); @@ -1179,6 +1177,36 @@ class Catalog { } } } + return undefined; + } + + /** + * Get content for an attachment. + * + * @param {string} id + * Unique attachment identifier (required). + * @returns {CatalogAttachmentContent} + * Content. + */ + attachmentContent(id) { + const namedContent = this.#attachmentContentByName(id); + if (namedContent !== undefined) { + return namedContent; + } + + // Annotation-local attachments register the reference of their embedded + // content in the catalog, so it's re-fetched from the xref on demand + // instead of being cached (which would then need to survive `cleanup`). + // The reference points either at the file-spec dictionary or, for an inline + // file-spec, straight at the embedded-file stream. + const ref = this.#annotationAttachmentRefById.get(id); + if (ref) { + const target = this.xref.fetch(ref); + if (target instanceof BaseStream) { + return FileSpec.readStreamContent(target); + } + return target instanceof Dict ? FileSpec.readContent(target) : null; + } return null; } @@ -1280,9 +1308,6 @@ class Catalog { async cleanup(manuallyTriggered = false) { clearGlobalCaches(); - this.#attachmentIdByRef?.clear(); - this.#attachmentIdByRef = null; - this.attachmentDictById.clear(); this.globalColorSpaceCache.clear(); this.globalImageCache.clear(/* onlyData = */ manuallyTriggered); this.pageKidsCountCache.clear(); diff --git a/src/core/file_spec.js b/src/core/file_spec.js index ee602f516..6409dc504 100644 --- a/src/core/file_spec.js +++ b/src/core/file_spec.js @@ -27,29 +27,6 @@ import { stringToPDFString } from "./string_utils.js"; * @import { CatalogAttachmentContent } from "./catalog.js"; */ -/** - * Get a platform-specific item from a file-spec dictionary. - * - * Search order follows the PDF platform keys: `UF`, `F`, `Unix`, `Mac`, - * `DOS`. - * - * @param {Dict | null | undefined} dict - * Dictionary. - * @returns {unknown} - * Matching dictionary value or `null` when no key is found. - */ -function pickPlatformItem(dict) { - if (dict instanceof Dict) { - // Look for the filename in this order: UF, F, Unix, Mac, DOS - for (const key of ["UF", "F", "Unix", "Mac", "DOS"]) { - if (dict.has(key)) { - return dict.get(key); - } - } - } - return null; -} - /** * "A PDF file can refer to the contents of another file by using a File * Specification (PDF 1.1)", see the spec (7.11) for more details. @@ -76,7 +53,7 @@ class FileSpec { } get filename() { - const item = pickPlatformItem(this.root); + const item = FileSpec.pickPlatformItem(this.root); if (item && typeof item === "string") { // NOTE: The following replacement order is INTENTIONAL, regardless of // what some static code analysers (e.g. CodeQL) may claim. @@ -105,6 +82,31 @@ class FileSpec { }; } + /** + * Get a platform-specific item from a file-spec dictionary. + * + * Search order follows the PDF platform keys: `UF`, `F`, `Unix`, `Mac`, + * `DOS`. + * + * @param {Dict | null | undefined} dict + * Dictionary. + * @param {boolean} [raw] + * Return the raw (possibly indirect) value rather than the resolved one. + * @returns {unknown} + * Matching dictionary value or `null` when no key is found. + */ + static pickPlatformItem(dict, raw = false) { + if (dict instanceof Dict) { + // Look for the filename in this order: UF, F, Unix, Mac, DOS + for (const key of ["UF", "F", "Unix", "Mac", "DOS"]) { + if (dict.has(key)) { + return raw ? dict.getRaw(key) : dict.get(key); + } + } + } + return null; + } + /** * Read attachment bytes from a file-spec dictionary. * @@ -119,24 +121,36 @@ class FileSpec { if (!(dict instanceof Dict)) { return null; } - const ef = pickPlatformItem(dict.get("EF")); + const ef = this.pickPlatformItem(dict.get("EF")); if (!(ef instanceof BaseStream)) { warn( "Embedded file specification points to non-existing/invalid content" ); return null; } + return this.readStreamContent(ef); + } + /** + * Read the bytes of an embedded-file stream. + * + * @param {BaseStream} stream + * Embedded-file stream. + * @returns {CatalogAttachmentContent} + * Attachment bytes. + * @throws {PasswordException} + * When the bytes are encrypted and no key is available. + */ + static readStreamContent(stream) { // Throw if we need a password but don’t have one. - const encrypt = dict.xref?.encrypt; + const encrypt = stream.dict?.xref?.encrypt; if (encrypt?.encryptionKey === null) { throw new PasswordException( "No password given", PasswordResponses.NEED_PASSWORD ); } - - return ef.getBytes(); + return stream.getBytes(); } } diff --git a/test/unit/annotation_spec.js b/test/unit/annotation_spec.js index 2744ca915..99fe01e78 100644 --- a/test/unit/annotation_spec.js +++ b/test/unit/annotation_spec.js @@ -26,6 +26,7 @@ import { AnnotationFieldFlag, AnnotationFlag, AnnotationType, + bytesToString, DrawOPS, OPS, RenderingIntentFlag, @@ -41,6 +42,7 @@ import { } from "./test_utils.js"; import { Dict, Name, Ref, RefSetCache } from "../../src/core/primitives.js"; import { Lexer, Parser } from "../../src/core/parser.js"; +import { Catalog } from "../../src/core/catalog.js"; import { FlateStream } from "../../src/core/flate_stream.js"; import { PartialEvaluator } from "../../src/core/evaluator.js"; import { StringStream } from "../../src/core/stream.js"; @@ -52,9 +54,10 @@ describe("annotation", function () { constructor(params) { this.pdfDocument = { catalog: { - attachmentDictById: new Map(), - attachmentIdByRef: new RefSetCache(), baseUrl: params.docBaseUrl || null, + getAttachmentIdForAnnotation(ref) { + return `attachmentRef:${ref.toString()}`; + }, }, }; this.evaluatorOptions = { @@ -4403,20 +4406,17 @@ describe("annotation", function () { idFactoryMock ); expect(data.annotationType).toEqual(AnnotationType.FILEATTACHMENT); - expect(data.fileId.startsWith("annotation:")).toEqual(true); + // The file-spec is an indirect object, so its reference is encoded in the + // id and re-fetched on demand. + expect(data.fileId).toEqual("attachmentRef:19R"); expect(data.file).toEqual({ rawFilename: "Test.txt", filename: "Test.txt", description: "abc", }); - - // Content lookup and reading requires a bigger mock than used here. - expect( - pdfManagerMock.pdfDocument.catalog.attachmentDictById.has(data.fileId) - ).toEqual(true); }); - it("should reuse the attachment NameTree id for referenced files", async function () { + it("should re-derive an inline file attachment from its embedded stream", async function () { const fileStream = new StringStream( "<<\n" + "/Type /EmbeddedFile\n" + @@ -4432,41 +4432,36 @@ describe("annotation", function () { allowStreams: true, }); - const fileStreamRef = Ref.get(28, 0); + const fileStreamRef = Ref.get(18, 0); const fileStreamDict = parser.getObj(); const embeddedFileDict = new Dict(); embeddedFileDict.set("F", fileStreamRef); - const fileSpecRef = Ref.get(29, 0); + // The file-spec is inline (not an indirect object), so the embedded-file + // stream's reference is encoded in the id instead. const fileSpecDict = new Dict(); fileSpecDict.set("Type", Name.get("Filespec")); fileSpecDict.set("Desc", "abc"); fileSpecDict.set("EF", embeddedFileDict); fileSpecDict.set("UF", "Test.txt"); - const fileAttachmentRef = Ref.get(30, 0); + const fileAttachmentRef = Ref.get(20, 0); const fileAttachmentDict = new Dict(); fileAttachmentDict.set("Type", Name.get("Annot")); fileAttachmentDict.set("Subtype", Name.get("FileAttachment")); - fileAttachmentDict.set("FS", fileSpecRef); + fileAttachmentDict.set("FS", fileSpecDict); fileAttachmentDict.set("T", "Topic"); fileAttachmentDict.set("Contents", "Test.txt"); const xref = new XRefMock([ { ref: fileStreamRef, data: fileStreamDict }, - { ref: fileSpecRef, data: fileSpecDict }, { ref: fileAttachmentRef, data: fileAttachmentDict }, ]); embeddedFileDict.assignXref(xref); fileSpecDict.assignXref(xref); fileAttachmentDict.assignXref(xref); - pdfManagerMock.pdfDocument.catalog.attachmentIdByRef.put( - fileSpecRef, - "Test.txt" - ); - const { data } = await AnnotationFactory.create( xref, fileAttachmentRef, @@ -4474,17 +4469,81 @@ describe("annotation", function () { idFactoryMock ); expect(data.annotationType).toEqual(AnnotationType.FILEATTACHMENT); - expect(data.fileId).toEqual("Test.txt"); + expect(data.fileId).toEqual("attachmentRef:18R"); expect(data.file).toEqual({ rawFilename: "Test.txt", filename: "Test.txt", description: "abc", }); + }); - // File should not be added as it’s already referenced in the `NameTree`. + it("should keep named attachment ids distinct from annotation attachment ids", function () { + const annotationStreamRef = Ref.get(18, 0); + const annotationStreamDict = new Dict(); + annotationStreamDict.set("Type", Name.get("EmbeddedFile")); + const annotationStream = new StringStream( + "Annotation attachment", + annotationStreamDict + ); + + const namedStreamRef = Ref.get(21, 0); + const namedStreamDict = new Dict(); + namedStreamDict.set("Type", Name.get("EmbeddedFile")); + const namedStream = new StringStream("Named attachment", namedStreamDict); + + const namedEmbeddedFileDict = new Dict(); + namedEmbeddedFileDict.set("F", namedStreamRef); + + const namedFileSpecRef = Ref.get(22, 0); + const namedFileSpecDict = new Dict(); + namedFileSpecDict.set("Type", Name.get("Filespec")); + namedFileSpecDict.set("EF", namedEmbeddedFileDict); + namedFileSpecDict.set("F", "Named.txt"); + + const pagesDict = new Dict(); + const embeddedFilesDict = new Dict(); + embeddedFilesDict.set("Names", ["attachmentRef:18R", namedFileSpecRef]); + + const namesDict = new Dict(); + namesDict.set("EmbeddedFiles", embeddedFilesDict); + + const catalogDict = new Dict(); + catalogDict.set("Pages", pagesDict); + catalogDict.set("Names", namesDict); + + const xref = new XRefMock([ + { ref: annotationStreamRef, data: annotationStream }, + { ref: namedStreamRef, data: namedStream }, + { ref: namedFileSpecRef, data: namedFileSpecDict }, + ]); + xref.getCatalogObj = () => catalogDict; + + for (const dict of [ + annotationStreamDict, + namedStreamDict, + namedEmbeddedFileDict, + namedFileSpecDict, + pagesDict, + embeddedFilesDict, + namesDict, + catalogDict, + ]) { + dict.assignXref(xref); + } + + const catalog = new Catalog(pdfManagerMock, xref); + const annotationId = + catalog.getAttachmentIdForAnnotation(annotationStreamRef); + + expect(annotationId).toEqual("attachmentRef:18R-1"); expect( - pdfManagerMock.pdfDocument.catalog.attachmentDictById.has(data.fileId) - ).toEqual(false); + bytesToString(catalog.attachmentContent("attachmentRef:18R")) + ).toEqual("Named attachment"); + expect(bytesToString(catalog.attachmentContent(annotationId))).toEqual( + "Annotation attachment" + ); + // An unknown id resolves to no content. + expect(catalog.attachmentContent("nonexistent")).toEqual(null); }); }); diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 86b9e2c9d..fa8542691 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -3915,6 +3915,34 @@ describe("api", function () { await loadingTask.destroy(); }); + it("gets FileAttachment annotation content that stays readable after cleanup", async function () { + // The embedded files are reachable only via the annotations (no catalog + // `/Names` tree), so their content must survive `cleanup` by being + // re-derivable from the xref. + const loadingTask = getDocument(buildGetDocumentParams("bug1230933.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const annotations = await pdfPage.getAnnotations(); + + const fileAnnotation = annotations.find( + a => a.annotationType === AnnotationType.FILEATTACHMENT + ); + const { fileId } = fileAnnotation; + expect(fileId.startsWith("attachmentRef:")).toEqual(true); + + const before = await pdfDoc.getAttachmentContent(fileId); + expect(before).toBeInstanceOf(Uint8Array); + expect(before.length).toEqual(234414); + + await pdfDoc.cleanup(); + + const after = await pdfDoc.getAttachmentContent(fileId); + expect(after).toBeInstanceOf(Uint8Array); + expect(after.length).toEqual(234414); + + await loadingTask.destroy(); + }); + it("gets annotations containing /Launch action with /FileSpec dictionary (issue 17846)", async function () { const loadingTask = getDocument(buildGetDocumentParams("issue17846.pdf")); const pdfDoc = await loadingTask.promise;