Re-derive annotation attachment content from the xref after cleanup

Annotation-local attachments (those not in the catalog `/Names` tree) were
resolved through a dictionary cache that `Catalog.cleanup` clears, so their
content became unreachable once the idle cleanup had run.

Encode the reference of the embedded content in the attachment id and re-fetch
it from the xref on demand instead of caching the dictionary, so the content
stays reachable without anything having to survive cleanup.

It fixes a regression introduced by #21351.
This commit is contained in:
Calixte Denizet 2026-06-17 14:49:25 +02:00
parent 187c22126a
commit 9d9fb06d7f
5 changed files with 239 additions and 118 deletions

View File

@ -5408,33 +5408,28 @@ class FileAttachmentAnnotation extends MarkupAnnotation {
super(params);
const { annotationGlobals, dict } = params;
const fileSpecRef = dict.getRaw("FS");
const fsDict = dict.get("FS");
const file = new FileSpec(fsDict);
/** @type {{catalog?: Catalog}} */
const { catalog } = annotationGlobals.pdfManager.pdfDocument;
// When this annotation references an embedded file thats already in the
// catalog `NameTree` (such as `EFOpen`), reuse that `NameTree` id so the
// sidebar and annotation paths resolve the same attachment identity.
let fileId =
fileSpecRef instanceof Ref
? catalog?.attachmentIdByRef.get(fileSpecRef)
: undefined;
// Fallback ids are namespaced to keep annotation-local ids distinct from
// `NameTree` ids (which are filename-based).
if (catalog && fsDict instanceof Dict && typeof fileId !== "string") {
const baseFileId = `annotation:${this.data.id}`;
fileId = baseFileId;
let i = 1;
while (catalog.attachmentDictById.has(fileId)) {
fileId = `${baseFileId}-${i++}`;
// Encode the embedded content's reference in the id so it can be
// re-fetched from the xref on demand (see `Catalog.attachmentContent`)
// instead of being cached where `cleanup` would wipe it. The file-spec is
// usually indirect; when it's inline its embedded-file stream still isn't
// (streams are always indirect), so fall back to that ref.
let fileId;
if (fsDict instanceof Dict) {
let contentRef = dict.getRaw("FS");
if (!(contentRef instanceof Ref)) {
contentRef = FileSpec.pickPlatformItem(
fsDict.get("EF"),
/* raw = */ true
);
}
if (contentRef instanceof Ref) {
fileId = catalog?.getAttachmentIdForAnnotation(contentRef);
}
// Cache only fallbacks.
catalog.attachmentDictById.set(fileId, fsDict);
}
this.data.hasOwnCanvas = this.data.noRotate;

View File

@ -119,18 +119,12 @@ function fetchRemoteDest(action) {
class Catalog {
#actualNumPages = null;
/** @type {RefSetCache | null} */
#attachmentIdByRef = null;
#annotationAttachmentIdByRef = new RefSetCache();
#annotationAttachmentRefById = new Map();
#catDict = null;
/**
* Attachment dictionaries keyed by attachment id.
*
* @type {Map<string, Dict>}
*/
attachmentDictById = new Map();
builtInCMapCache = new Map();
fontCache = new RefSetCache();
@ -164,33 +158,44 @@ class Catalog {
this.toplevelPagesDict; // eslint-disable-line no-unused-expressions
}
/**
* Attachment ids keyed by embedded-file reference.
*
* @type {RefSetCache}
*/
get attachmentIdByRef() {
if (this.#attachmentIdByRef) {
return this.#attachmentIdByRef;
}
const attachmentIdByRef = new RefSetCache();
for (const [name, ref] of this.rawEmbeddedFiles || []) {
if (!(ref instanceof Ref)) {
continue;
}
attachmentIdByRef.put(
ref,
stringToPDFString(name, /* keepEscapeSequence = */ true)
);
}
return (this.#attachmentIdByRef = attachmentIdByRef);
}
cloneDict() {
return this.#catDict.clone();
}
/**
* Create an id for an attachment from a FileAttachment annotation.
*
* The id is registered here rather than parsed from a public string prefix in
* `attachmentContent`, since catalog attachment names can be arbitrary PDF
* strings and may otherwise collide with annotation-local ids.
*
* @param {Ref} ref
* File-spec or embedded-file stream reference.
* @returns {string}
* Attachment id.
*/
getAttachmentIdForAnnotation(ref) {
let id = this.#annotationAttachmentIdByRef.get(ref);
if (id) {
return id;
}
const baseId = `attachmentRef:${ref.toString()}`;
id = baseId;
let i = 1;
while (
this.#annotationAttachmentRefById.has(id) ||
this.attachments?.has(id)
) {
id = `${baseId}-${i++}`;
}
this.#annotationAttachmentIdByRef.put(ref, id);
this.#annotationAttachmentRefById.set(id, ref);
return id;
}
get version() {
const version = this.#catDict.get("Version");
if (version instanceof Name) {
@ -1157,19 +1162,12 @@ class Catalog {
}
/**
* Get content for an attachment.
*
* @param {string} id
* Unique attachment identifier (required).
* @returns {CatalogAttachmentContent}
* Content.
* Unique attachment identifier.
* @returns {CatalogAttachmentContent | undefined}
* Content, or `undefined` when no named attachment exists for the id.
*/
attachmentContent(id) {
const dict = this.attachmentDictById.get(id);
if (dict) {
return FileSpec.readContent(dict);
}
#attachmentContentByName(id) {
const obj = this.#catDict.get("Names");
if (obj instanceof Dict && obj.has("EmbeddedFiles")) {
const nameTree = new NameTree(obj.getRaw("EmbeddedFiles"), this.xref);
@ -1179,6 +1177,36 @@ class Catalog {
}
}
}
return undefined;
}
/**
* Get content for an attachment.
*
* @param {string} id
* Unique attachment identifier (required).
* @returns {CatalogAttachmentContent}
* Content.
*/
attachmentContent(id) {
const namedContent = this.#attachmentContentByName(id);
if (namedContent !== undefined) {
return namedContent;
}
// Annotation-local attachments register the reference of their embedded
// content in the catalog, so it's re-fetched from the xref on demand
// instead of being cached (which would then need to survive `cleanup`).
// The reference points either at the file-spec dictionary or, for an inline
// file-spec, straight at the embedded-file stream.
const ref = this.#annotationAttachmentRefById.get(id);
if (ref) {
const target = this.xref.fetch(ref);
if (target instanceof BaseStream) {
return FileSpec.readStreamContent(target);
}
return target instanceof Dict ? FileSpec.readContent(target) : null;
}
return null;
}
@ -1280,9 +1308,6 @@ class Catalog {
async cleanup(manuallyTriggered = false) {
clearGlobalCaches();
this.#attachmentIdByRef?.clear();
this.#attachmentIdByRef = null;
this.attachmentDictById.clear();
this.globalColorSpaceCache.clear();
this.globalImageCache.clear(/* onlyData = */ manuallyTriggered);
this.pageKidsCountCache.clear();

View File

@ -27,29 +27,6 @@ import { stringToPDFString } from "./string_utils.js";
* @import { CatalogAttachmentContent } from "./catalog.js";
*/
/**
* Get a platform-specific item from a file-spec dictionary.
*
* Search order follows the PDF platform keys: `UF`, `F`, `Unix`, `Mac`,
* `DOS`.
*
* @param {Dict | null | undefined} dict
* Dictionary.
* @returns {unknown}
* Matching dictionary value or `null` when no key is found.
*/
function pickPlatformItem(dict) {
if (dict instanceof Dict) {
// Look for the filename in this order: UF, F, Unix, Mac, DOS
for (const key of ["UF", "F", "Unix", "Mac", "DOS"]) {
if (dict.has(key)) {
return dict.get(key);
}
}
}
return null;
}
/**
* "A PDF file can refer to the contents of another file by using a File
* Specification (PDF 1.1)", see the spec (7.11) for more details.
@ -76,7 +53,7 @@ class FileSpec {
}
get filename() {
const item = pickPlatformItem(this.root);
const item = FileSpec.pickPlatformItem(this.root);
if (item && typeof item === "string") {
// NOTE: The following replacement order is INTENTIONAL, regardless of
// what some static code analysers (e.g. CodeQL) may claim.
@ -105,6 +82,31 @@ class FileSpec {
};
}
/**
* Get a platform-specific item from a file-spec dictionary.
*
* Search order follows the PDF platform keys: `UF`, `F`, `Unix`, `Mac`,
* `DOS`.
*
* @param {Dict | null | undefined} dict
* Dictionary.
* @param {boolean} [raw]
* Return the raw (possibly indirect) value rather than the resolved one.
* @returns {unknown}
* Matching dictionary value or `null` when no key is found.
*/
static pickPlatformItem(dict, raw = false) {
if (dict instanceof Dict) {
// Look for the filename in this order: UF, F, Unix, Mac, DOS
for (const key of ["UF", "F", "Unix", "Mac", "DOS"]) {
if (dict.has(key)) {
return raw ? dict.getRaw(key) : dict.get(key);
}
}
}
return null;
}
/**
* Read attachment bytes from a file-spec dictionary.
*
@ -119,24 +121,36 @@ class FileSpec {
if (!(dict instanceof Dict)) {
return null;
}
const ef = pickPlatformItem(dict.get("EF"));
const ef = this.pickPlatformItem(dict.get("EF"));
if (!(ef instanceof BaseStream)) {
warn(
"Embedded file specification points to non-existing/invalid content"
);
return null;
}
return this.readStreamContent(ef);
}
/**
* Read the bytes of an embedded-file stream.
*
* @param {BaseStream} stream
* Embedded-file stream.
* @returns {CatalogAttachmentContent}
* Attachment bytes.
* @throws {PasswordException}
* When the bytes are encrypted and no key is available.
*/
static readStreamContent(stream) {
// Throw if we need a password but dont have one.
const encrypt = dict.xref?.encrypt;
const encrypt = stream.dict?.xref?.encrypt;
if (encrypt?.encryptionKey === null) {
throw new PasswordException(
"No password given",
PasswordResponses.NEED_PASSWORD
);
}
return ef.getBytes();
return stream.getBytes();
}
}

View File

@ -26,6 +26,7 @@ import {
AnnotationFieldFlag,
AnnotationFlag,
AnnotationType,
bytesToString,
DrawOPS,
OPS,
RenderingIntentFlag,
@ -41,6 +42,7 @@ import {
} from "./test_utils.js";
import { Dict, Name, Ref, RefSetCache } from "../../src/core/primitives.js";
import { Lexer, Parser } from "../../src/core/parser.js";
import { Catalog } from "../../src/core/catalog.js";
import { FlateStream } from "../../src/core/flate_stream.js";
import { PartialEvaluator } from "../../src/core/evaluator.js";
import { StringStream } from "../../src/core/stream.js";
@ -52,9 +54,10 @@ describe("annotation", function () {
constructor(params) {
this.pdfDocument = {
catalog: {
attachmentDictById: new Map(),
attachmentIdByRef: new RefSetCache(),
baseUrl: params.docBaseUrl || null,
getAttachmentIdForAnnotation(ref) {
return `attachmentRef:${ref.toString()}`;
},
},
};
this.evaluatorOptions = {
@ -4403,20 +4406,17 @@ describe("annotation", function () {
idFactoryMock
);
expect(data.annotationType).toEqual(AnnotationType.FILEATTACHMENT);
expect(data.fileId.startsWith("annotation:")).toEqual(true);
// The file-spec is an indirect object, so its reference is encoded in the
// id and re-fetched on demand.
expect(data.fileId).toEqual("attachmentRef:19R");
expect(data.file).toEqual({
rawFilename: "Test.txt",
filename: "Test.txt",
description: "abc",
});
// Content lookup and reading requires a bigger mock than used here.
expect(
pdfManagerMock.pdfDocument.catalog.attachmentDictById.has(data.fileId)
).toEqual(true);
});
it("should reuse the attachment NameTree id for referenced files", async function () {
it("should re-derive an inline file attachment from its embedded stream", async function () {
const fileStream = new StringStream(
"<<\n" +
"/Type /EmbeddedFile\n" +
@ -4432,41 +4432,36 @@ describe("annotation", function () {
allowStreams: true,
});
const fileStreamRef = Ref.get(28, 0);
const fileStreamRef = Ref.get(18, 0);
const fileStreamDict = parser.getObj();
const embeddedFileDict = new Dict();
embeddedFileDict.set("F", fileStreamRef);
const fileSpecRef = Ref.get(29, 0);
// The file-spec is inline (not an indirect object), so the embedded-file
// stream's reference is encoded in the id instead.
const fileSpecDict = new Dict();
fileSpecDict.set("Type", Name.get("Filespec"));
fileSpecDict.set("Desc", "abc");
fileSpecDict.set("EF", embeddedFileDict);
fileSpecDict.set("UF", "Test.txt");
const fileAttachmentRef = Ref.get(30, 0);
const fileAttachmentRef = Ref.get(20, 0);
const fileAttachmentDict = new Dict();
fileAttachmentDict.set("Type", Name.get("Annot"));
fileAttachmentDict.set("Subtype", Name.get("FileAttachment"));
fileAttachmentDict.set("FS", fileSpecRef);
fileAttachmentDict.set("FS", fileSpecDict);
fileAttachmentDict.set("T", "Topic");
fileAttachmentDict.set("Contents", "Test.txt");
const xref = new XRefMock([
{ ref: fileStreamRef, data: fileStreamDict },
{ ref: fileSpecRef, data: fileSpecDict },
{ ref: fileAttachmentRef, data: fileAttachmentDict },
]);
embeddedFileDict.assignXref(xref);
fileSpecDict.assignXref(xref);
fileAttachmentDict.assignXref(xref);
pdfManagerMock.pdfDocument.catalog.attachmentIdByRef.put(
fileSpecRef,
"Test.txt"
);
const { data } = await AnnotationFactory.create(
xref,
fileAttachmentRef,
@ -4474,17 +4469,81 @@ describe("annotation", function () {
idFactoryMock
);
expect(data.annotationType).toEqual(AnnotationType.FILEATTACHMENT);
expect(data.fileId).toEqual("Test.txt");
expect(data.fileId).toEqual("attachmentRef:18R");
expect(data.file).toEqual({
rawFilename: "Test.txt",
filename: "Test.txt",
description: "abc",
});
});
// File should not be added as its already referenced in the `NameTree`.
it("should keep named attachment ids distinct from annotation attachment ids", function () {
const annotationStreamRef = Ref.get(18, 0);
const annotationStreamDict = new Dict();
annotationStreamDict.set("Type", Name.get("EmbeddedFile"));
const annotationStream = new StringStream(
"Annotation attachment",
annotationStreamDict
);
const namedStreamRef = Ref.get(21, 0);
const namedStreamDict = new Dict();
namedStreamDict.set("Type", Name.get("EmbeddedFile"));
const namedStream = new StringStream("Named attachment", namedStreamDict);
const namedEmbeddedFileDict = new Dict();
namedEmbeddedFileDict.set("F", namedStreamRef);
const namedFileSpecRef = Ref.get(22, 0);
const namedFileSpecDict = new Dict();
namedFileSpecDict.set("Type", Name.get("Filespec"));
namedFileSpecDict.set("EF", namedEmbeddedFileDict);
namedFileSpecDict.set("F", "Named.txt");
const pagesDict = new Dict();
const embeddedFilesDict = new Dict();
embeddedFilesDict.set("Names", ["attachmentRef:18R", namedFileSpecRef]);
const namesDict = new Dict();
namesDict.set("EmbeddedFiles", embeddedFilesDict);
const catalogDict = new Dict();
catalogDict.set("Pages", pagesDict);
catalogDict.set("Names", namesDict);
const xref = new XRefMock([
{ ref: annotationStreamRef, data: annotationStream },
{ ref: namedStreamRef, data: namedStream },
{ ref: namedFileSpecRef, data: namedFileSpecDict },
]);
xref.getCatalogObj = () => catalogDict;
for (const dict of [
annotationStreamDict,
namedStreamDict,
namedEmbeddedFileDict,
namedFileSpecDict,
pagesDict,
embeddedFilesDict,
namesDict,
catalogDict,
]) {
dict.assignXref(xref);
}
const catalog = new Catalog(pdfManagerMock, xref);
const annotationId =
catalog.getAttachmentIdForAnnotation(annotationStreamRef);
expect(annotationId).toEqual("attachmentRef:18R-1");
expect(
pdfManagerMock.pdfDocument.catalog.attachmentDictById.has(data.fileId)
).toEqual(false);
bytesToString(catalog.attachmentContent("attachmentRef:18R"))
).toEqual("Named attachment");
expect(bytesToString(catalog.attachmentContent(annotationId))).toEqual(
"Annotation attachment"
);
// An unknown id resolves to no content.
expect(catalog.attachmentContent("nonexistent")).toEqual(null);
});
});

View File

@ -3915,6 +3915,34 @@ describe("api", function () {
await loadingTask.destroy();
});
it("gets FileAttachment annotation content that stays readable after cleanup", async function () {
// The embedded files are reachable only via the annotations (no catalog
// `/Names` tree), so their content must survive `cleanup` by being
// re-derivable from the xref.
const loadingTask = getDocument(buildGetDocumentParams("bug1230933.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const annotations = await pdfPage.getAnnotations();
const fileAnnotation = annotations.find(
a => a.annotationType === AnnotationType.FILEATTACHMENT
);
const { fileId } = fileAnnotation;
expect(fileId.startsWith("attachmentRef:")).toEqual(true);
const before = await pdfDoc.getAttachmentContent(fileId);
expect(before).toBeInstanceOf(Uint8Array);
expect(before.length).toEqual(234414);
await pdfDoc.cleanup();
const after = await pdfDoc.getAttachmentContent(fileId);
expect(after).toBeInstanceOf(Uint8Array);
expect(after.length).toEqual(234414);
await loadingTask.destroy();
});
it("gets annotations containing /Launch action with /FileSpec dictionary (issue 17846)", async function () {
const loadingTask = getDocument(buildGetDocumentParams("issue17846.pdf"));
const pdfDoc = await loadingTask.promise;