Merge pull request #21379 from calixteman/dedup_stream_merging

Deduplicate shared font/image streams when merging PDFs
This commit is contained in:
Tim van der Meij 2026-06-04 20:58:22 +02:00 committed by GitHub
commit 23ea0810d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 341 additions and 15 deletions

View File

@ -29,14 +29,15 @@ import {
} from "../core_utils.js";
import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js";
import { incrementalUpdate, writeValue } from "../writer.js";
import { isArrayEqual, stringToBytes } from "../../shared/util.js";
import { NameTree, NumberTree } from "../name_number_tree.js";
import { stringToAsciiOrUTF16BE, stringToPDFString } from "../string_utils.js";
import { AnnotationFactory } from "../annotation.js";
import { BaseStream } from "../base_stream.js";
import { createImage } from "./pdf_images.js";
import { LETTER_SIZE_MEDIABOX } from "../document.js";
import { MurmurHash3_64 } from "../../shared/murmurhash3.js";
import { StringStream } from "../stream.js";
import { stringToBytes } from "../../shared/util.js";
const MAX_LEAVES_PER_PAGES_NODE = 16;
const MAX_IN_NAME_TREE_NODE = 64;
@ -63,6 +64,7 @@ class DocumentData {
this.dedupNamedDestinations = new Map();
this.usedNamedDestinations = new Set();
this.postponedRefCopies = new RefSetCache();
this.resourceStreamPromises = new Map();
this.usedStructParents = new Set();
this.oldStructParentMapping = new Map();
this.structTreeRoot = null;
@ -126,6 +128,12 @@ class PDFEditor {
#primaryDocument = null;
// Deduplicates resource streams (fonts/images) shared across the merged
// documents. Maps a cheap content key to a bucket of { ref, dictStr, stream }
// candidates. The key only groups possible matches; an exact byte comparison
// makes the final decision, so a key collision can never alias two distinct
// resources.
#resourceStreamCache = new Map();
currentDocument = null;
oldPages = [];
@ -232,16 +240,22 @@ class PDFEditor {
* @param {*} obj
* @param {boolean} mustClone
* @param {XRef} xref
* @param {RefSet} resourceStreamPath
* @returns {Promise<*>}
*/
async #collectDependencies(obj, mustClone, xref) {
async #collectDependencies(
obj,
mustClone,
xref,
resourceStreamPath = new RefSet()
) {
if (obj instanceof Ref) {
const {
currentDocument: { oldRefMapping },
} = this;
let newRef = oldRefMapping.get(obj);
if (newRef) {
return newRef;
const existingRef = oldRefMapping.get(obj);
if (existingRef) {
return existingRef;
}
const oldRef = obj;
obj = await xref.fetchAsync(oldRef);
@ -250,7 +264,19 @@ class PDFEditor {
return obj;
}
newRef = this.newRef;
// Deduplicate fonts/images against earlier copies (common when merging
// exports of the same template). Reusing an earlier copy needs no new
// reference, so #collectResourceStream defers the allocation until the stream
// is known to be new.
if (obj instanceof BaseStream && this.#isResourceStream(obj.dict)) {
return this.#collectResourceStream(
oldRef,
obj,
xref,
resourceStreamPath
);
}
const newRef = this.newRef;
oldRefMapping.put(oldRef, newRef);
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
@ -265,7 +291,12 @@ class PDFEditor {
}
}
this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
this.xref[newRef.num] = await this.#collectDependencies(
obj,
true,
xref,
resourceStreamPath
);
return newRef;
}
const promises = [];
@ -285,9 +316,12 @@ class PDFEditor {
continue;
}
promises.push(
this.#collectDependencies(obj[i], true, xref).then(
newObj => (obj[i] = newObj)
)
this.#collectDependencies(
obj[i],
true,
xref,
resourceStreamPath
).then(newObj => (obj[i] = newObj))
);
}
await Promise.all(promises);
@ -314,9 +348,12 @@ class PDFEditor {
continue;
}
promises.push(
this.#collectDependencies(rawObj, true, xref).then(newObj =>
dict.set(key, newObj)
)
this.#collectDependencies(
rawObj,
true,
xref,
resourceStreamPath
).then(newObj => dict.set(key, newObj))
);
}
await Promise.all(promises);
@ -325,6 +362,175 @@ class PDFEditor {
return obj;
}
/**
* Whether a stream is worth deduplicating: an image or an embedded font
* program (large and often shared). Per-page content streams etc. are
* essentially never shared, so hashing them would be wasted work.
* @param {Dict} dict
* @returns {boolean}
*/
#isResourceStream(dict) {
const subtype = dict.get("Subtype");
return (
isName(subtype, "Image") ||
// FontFile/FontFile2 carry Length1; FontFile3 has one of these Subtypes.
dict.has("Length1") ||
isName(subtype, "Type1C") ||
isName(subtype, "CIDFontType0C") ||
isName(subtype, "OpenType")
);
}
/**
* Read the raw, still-encoded bytes of a stream.
* @param {BaseStream} stream
* @returns {Uint8Array}
*/
#rawStreamBytes(stream) {
const original = stream.getOriginalStream();
original.reset();
return original.getBytes();
}
/**
* Serialize a dictionary to a canonical string. Two clones of the same source
* dict serialize identically, so this works as a bucket key and as an exact
* comparison.
* @param {Dict} dict
* @returns {Promise<string>}
*/
async #serializeDict(dict) {
const buffer = [];
await writeValue(dict, buffer, /* transform = */ null);
return buffer.join("");
}
/**
* Cheap bucket key for a resource stream: the serialized dict, the byte
* length, and a few sampled chunks (so large payloads aren't fully hashed).
* Collisions only group candidates that are then compared byte-for-byte, so
* they cost time but never cause a wrong merge.
* @param {string} dictStr
* @param {Uint8Array} bytes
* @returns {string}
*/
#resourceStreamKey(dictStr, bytes) {
const SAMPLE_SIZE = 256;
const SAMPLE_COUNT = 4;
const { length } = bytes;
const hash = new MurmurHash3_64();
hash.update(dictStr);
hash.update(`#${length}`);
if (length <= SAMPLE_SIZE * SAMPLE_COUNT) {
hash.update(bytes);
} else {
const step = Math.floor((length - SAMPLE_SIZE) / (SAMPLE_COUNT - 1));
for (let i = 0; i < SAMPLE_COUNT; i++) {
const start = Math.min(i * step, length - SAMPLE_SIZE);
hash.update(bytes.subarray(start, start + SAMPLE_SIZE));
}
}
return hash.hexdigest();
}
/**
* Clone a resource stream and return its output reference, reusing an earlier
* copy when possible. The reference is allocated lazily (in
* #dedupResourceStream), so a reused resource leaves no unused reference.
* @param {Ref} oldRef
* @param {BaseStream} stream
* @param {XRef} xref
* @param {RefSet} resourceStreamPath
* @returns {Promise<Ref>}
*/
async #collectResourceStream(oldRef, stream, xref, resourceStreamPath) {
const {
currentDocument: { oldRefMapping, resourceStreamPromises },
} = this;
// Re-entry means a (malformed) cycle back to this stream: allocate its
// reference now to break the loop, like the generic path's eager alloc.
if (resourceStreamPath.has(oldRef)) {
let ref = oldRefMapping.get(oldRef);
if (!ref) {
ref = this.newRef;
oldRefMapping.put(oldRef, ref);
}
return ref;
}
const key = oldRef.toString();
const pending = resourceStreamPromises.get(key);
if (pending) {
return pending;
}
// The path only grows here, so the shared parent path can be passed
// read-only everywhere else; snapshot it, add this stream, and recurse.
const childPath = new RefSet(resourceStreamPath);
childPath.put(oldRef);
const promise = Promise.resolve().then(async () => {
const collected = await this.#collectDependencies(
stream,
true,
xref,
childPath
);
// A cycle already allocated a reference, so store the clone there.
const cycleRef = oldRefMapping.get(oldRef);
if (cycleRef) {
this.xref[cycleRef.num] = collected;
return cycleRef;
}
const ref = await this.#dedupResourceStream(collected);
oldRefMapping.put(oldRef, ref);
return ref;
});
resourceStreamPromises.set(key, promise);
try {
return await promise;
} finally {
if (resourceStreamPromises.get(key) === promise) {
resourceStreamPromises.delete(key);
}
}
}
/**
* Return the reference for a cloned resource stream, reusing a byte-identical
* earlier copy or else allocating and registering a new one.
* @param {BaseStream} stream
* @returns {Promise<Ref>}
*/
async #dedupResourceStream(stream) {
const dictStr = await this.#serializeDict(stream.dict);
const bytes = this.#rawStreamBytes(stream);
const key = this.#resourceStreamKey(dictStr, bytes);
let bucket = this.#resourceStreamCache.get(key);
if (bucket) {
// Same key only means "maybe equal": confirm with an exact comparison.
for (const entry of bucket) {
if (
entry.dictStr === dictStr &&
isArrayEqual(this.#rawStreamBytes(entry.stream), bytes)
) {
return entry.ref;
}
}
} else {
bucket = [];
this.#resourceStreamCache.set(key, bucket);
}
const ref = this.newRef;
this.xref[ref.num] = stream;
bucket.push({ ref, dictStr, stream });
return ref;
}
async #cloneStructTreeNode(
parentStructRef,
node,

View File

@ -75,14 +75,17 @@ async function writeStream(stream, buffer, transform) {
isName(filterZero, "JBIG2Decode") ||
isName(filterZero, "CCITTFaxDecode") ||
isName(filterZero, "LZWDecode");
const isFilterZeroCompressedObject =
isFilterZeroFlateDecode ||
isFilterZeroImageDecode ||
isName(filterZero, "BrotliDecode");
// If the string is too small there is no real benefit in compressing it.
// The number 256 is arbitrary, but it should be reasonable.
const MIN_LENGTH_FOR_COMPRESSING = 256;
if (
!isFilterZeroFlateDecode &&
!isFilterZeroImageDecode &&
!isFilterZeroCompressedObject &&
bytes.length >= MIN_LENGTH_FOR_COMPRESSING
) {
try {

View File

@ -27,6 +27,7 @@ import {
PasswordResponses,
PermissionFlag,
ResponseException,
stringToBytes,
UnknownErrorException,
} from "../../src/shared/util.js";
import {
@ -88,6 +89,57 @@ describe("api", function () {
.join("");
}
/**
 * Count how many times an ASCII marker occurs in a byte sequence.
 * Overlapping occurrences are all counted.
 * @param {Uint8Array|Array<number>} bytes - The data to scan.
 * @param {string} marker - The text to look for, compared one UTF-16 code
 *   unit per byte.
 * @returns {number} The number of positions at which the marker matches.
 */
function countMarker(bytes, marker) {
  // Convert the marker to its code units once, rather than at every position.
  const codes = Array.from({ length: marker.length }, (_, k) =>
    marker.charCodeAt(k)
  );
  const lastStart = bytes.length - codes.length;

  let occurrences = 0;
  for (let start = 0; start <= lastStart; start++) {
    if (codes.every((code, k) => bytes[start + k] === code)) {
      occurrences++;
    }
  }
  return occurrences;
}
/**
 * Build, in memory, a minimal one-page PDF whose page resources reference
 * the same image stream (object 4) under two XObject names, /Im0 and /Im1.
 * @returns {Uint8Array} The bytes of the generated document.
 */
function buildSharedImageResourcePdf() {
  // Serialize one indirect stream object; /Length is derived from the data.
  const makeStream = (objNum, dictEntries, payload) =>
    [
      `${objNum} 0 obj`,
      `<< ${dictEntries} /Length ${payload.length} >>`,
      "stream",
      payload,
      "endstream",
      "endobj",
      "",
    ].join("\n");

  const imageDict = [
    "/Type /XObject /Subtype /Image /Width 1 /Height 1",
    "/ColorSpace /DeviceRGB /BitsPerComponent 8 /Filter /ASCIIHexDecode",
  ].join(" ");
  const pageDict = [
    "/Type /Page /Parent 2 0 R /MediaBox [0 0 10 10]",
    "/Resources << /XObject << /Im0 4 0 R /Im1 4 0 R >> >>",
    "/Contents 5 0 R",
  ].join(" ");

  const bodies = [
    "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
    "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
    `3 0 obj\n<< ${pageDict} >>\nendobj\n`,
    makeStream(4, imageDict, "FF0000>"),
    makeStream(5, "", "q 10 0 0 10 0 0 cm /Im0 Do Q"),
  ];

  // Record where every object starts; the cross-reference table needs it.
  const header = "%PDF-1.7\n";
  const offsets = [];
  let cursor = header.length;
  for (const body of bodies) {
    offsets.push(cursor);
    cursor += body.length;
  }
  const xrefOffset = cursor;
  const entryCount = bodies.length + 1;

  const xrefRows = offsets.map(
    offset => `${String(offset).padStart(10, "0")} 00000 n \n`
  );
  const pieces = [
    header,
    ...bodies,
    `xref\n0 ${entryCount}\n`,
    "0000000000 65535 f \n",
    ...xrefRows,
    `trailer\n<< /Size ${entryCount} /Root 1 0 R >>\n`,
    `startxref\n${xrefOffset}\n%%EOF\n`,
  ];
  return stringToBytes(pieces.join(""));
}
function getNamedNodeInXML(node, path) {
for (const component of path.split(".")) {
if (!node.childNodes) {
@ -5873,6 +5925,71 @@ small scripts as well as for`);
await loadingTask.destroy();
});
it("deduplicates resource streams shared across merged copies", async function () {
  const FONT_MARKER = "/CIDFontType0C";
  const COPIES = 4;
  const PAGES_PER_COPY = 3;

  const loadingTask = getDocument(
    buildGetDocumentParams("doc_1_3_pages.pdf")
  );
  const pdfDoc = await loadingTask.promise;

  // Loading the same buffer several times gives distinct documents on the
  // worker side, hence this really is a cross-document merge of identical
  // data.
  const pdfData = await DefaultFileReaderFactory.fetch({
    path: TEST_PDFS_PATH + "doc_2_3_pages.pdf",
  });

  // Reference value: the number of font programs in a single copy, which
  // contains no duplicates.
  const singleCopy = await pdfDoc.extractPages([{ document: pdfData }]);
  const fontsInSingleCopy = countMarker(singleCopy, FONT_MARKER);
  expect(fontsInSingleCopy).toBeGreaterThan(0);

  // Merging identical copies must not multiply the font programs, which is
  // what a naive merge would do.
  const pageInfos = Array.from({ length: COPIES }, () => ({
    document: pdfData,
  }));
  const merged = await pdfDoc.extractPages(pageInfos);
  expect(countMarker(merged, FONT_MARKER)).toEqual(fontsInSingleCopy);

  // The result still has to be a valid document in which every page can be
  // read.
  const mergedLoadingTask = getDocument({ data: merged });
  const mergedDoc = await mergedLoadingTask.promise;
  const expectedPageCount = PAGES_PER_COPY * COPIES;
  expect(mergedDoc.numPages).toEqual(expectedPageCount);

  for (let pageNumber = 1; pageNumber <= expectedPageCount; pageNumber++) {
    const page = await mergedDoc.getPage(pageNumber);
    const textContent = await page.getTextContent();
    const pageInCopy = ((pageNumber - 1) % PAGES_PER_COPY) + 1;
    expect(mergeText(textContent.items)).toEqual(
      `Document 2:Page ${pageInCopy}`
    );
  }

  await mergedLoadingTask.destroy();
  await loadingTask.destroy();
});
it("deduplicates resource streams reached concurrently", async function () {
  const sourceBytes = buildSharedImageResourcePdf();
  const loadingTask = getDocument({ data: sourceBytes.slice() });
  const pdfDoc = await loadingTask.promise;

  // In the source page two XObject names refer to one and the same image
  // stream. Both references get cloned concurrently out of the resource
  // dictionary, and that must not be mistaken for a reference cycle.
  const pageInfos = [
    { document: sourceBytes.slice() },
    { document: sourceBytes.slice() },
  ];
  const merged = await pdfDoc.extractPages(pageInfos);
  expect(countMarker(merged, "/Subtype /Image")).toEqual(1);

  // Both pages of the merged document have to be loadable.
  const mergedLoadingTask = getDocument({ data: merged });
  const mergedDoc = await mergedLoadingTask.promise;
  expect(mergedDoc.numPages).toEqual(2);
  for (const pageNumber of [1, 2]) {
    await mergedDoc.getPage(pageNumber);
  }

  await mergedLoadingTask.destroy();
  await loadingTask.destroy();
});
it("should merge two PDFs with page included ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")