From 1a7821ab1350e5eff1f339e92e48eff5c023fcff Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Tue, 2 Jun 2026 22:08:16 +0200 Subject: [PATCH] Deduplicate shared font/image streams when merging PDFs Identical embedded fonts and images across the merged documents are now written once and shared, instead of being copied per source file. Also avoid recompressing streams that are already compressed with Brotli. --- src/core/editor/pdf_editor.js | 232 ++++++++++++++++++++++++++++++++-- src/core/writer.js | 7 +- test/unit/api_spec.js | 117 +++++++++++++++++ 3 files changed, 341 insertions(+), 15 deletions(-) diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index f1945f41b..2ed888c73 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -29,14 +29,15 @@ import { } from "../core_utils.js"; import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js"; import { incrementalUpdate, writeValue } from "../writer.js"; +import { isArrayEqual, stringToBytes } from "../../shared/util.js"; import { NameTree, NumberTree } from "../name_number_tree.js"; import { stringToAsciiOrUTF16BE, stringToPDFString } from "../string_utils.js"; import { AnnotationFactory } from "../annotation.js"; import { BaseStream } from "../base_stream.js"; import { createImage } from "./pdf_images.js"; import { LETTER_SIZE_MEDIABOX } from "../document.js"; +import { MurmurHash3_64 } from "../../shared/murmurhash3.js"; import { StringStream } from "../stream.js"; -import { stringToBytes } from "../../shared/util.js"; const MAX_LEAVES_PER_PAGES_NODE = 16; const MAX_IN_NAME_TREE_NODE = 64; @@ -63,6 +64,7 @@ class DocumentData { this.dedupNamedDestinations = new Map(); this.usedNamedDestinations = new Set(); this.postponedRefCopies = new RefSetCache(); + this.resourceStreamPromises = new Map(); this.usedStructParents = new Set(); this.oldStructParentMapping = new Map(); this.structTreeRoot = null; @@ -126,6 +128,12 @@ class PDFEditor { 
#primaryDocument = null; + // Deduplicates resource streams (fonts/images) shared across the merged + // documents. Maps a cheap content key to a bucket of { ref, dictStr, stream } + // candidates; the key only groups possible matches, an exact byte comparison + // decides, so a key collision can never alias two distinct resources. + #resourceStreamCache = new Map(); + currentDocument = null; oldPages = []; @@ -232,16 +240,22 @@ class PDFEditor { * @param {*} obj * @param {boolean} mustClone * @param {XRef} xref + * @param {RefSet} resourceStreamPath * @returns {Promise<*>} */ - async #collectDependencies(obj, mustClone, xref) { + async #collectDependencies( + obj, + mustClone, + xref, + resourceStreamPath = new RefSet() + ) { if (obj instanceof Ref) { const { currentDocument: { oldRefMapping }, } = this; - let newRef = oldRefMapping.get(obj); - if (newRef) { - return newRef; + const existingRef = oldRefMapping.get(obj); + if (existingRef) { + return existingRef; } const oldRef = obj; obj = await xref.fetchAsync(oldRef); @@ -250,7 +264,19 @@ class PDFEditor { return obj; } - newRef = this.newRef; + // Deduplicate fonts/images against earlier copies (common when merging + // exports of the same template). Reusing a copy costs no reference, so + // allocation is deferred to #collectResourceStream until it's known new. 
+ if (obj instanceof BaseStream && this.#isResourceStream(obj.dict)) { + return this.#collectResourceStream( + oldRef, + obj, + xref, + resourceStreamPath + ); + } + + const newRef = this.newRef; oldRefMapping.put(oldRef, newRef); if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) { @@ -265,7 +291,12 @@ class PDFEditor { } } - this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref); + this.xref[newRef.num] = await this.#collectDependencies( + obj, + true, + xref, + resourceStreamPath + ); return newRef; } const promises = []; @@ -285,9 +316,12 @@ class PDFEditor { continue; } promises.push( - this.#collectDependencies(obj[i], true, xref).then( - newObj => (obj[i] = newObj) - ) + this.#collectDependencies( + obj[i], + true, + xref, + resourceStreamPath + ).then(newObj => (obj[i] = newObj)) ); } await Promise.all(promises); @@ -314,9 +348,12 @@ class PDFEditor { continue; } promises.push( - this.#collectDependencies(rawObj, true, xref).then(newObj => - dict.set(key, newObj) - ) + this.#collectDependencies( + rawObj, + true, + xref, + resourceStreamPath + ).then(newObj => dict.set(key, newObj)) ); } await Promise.all(promises); @@ -325,6 +362,175 @@ class PDFEditor { return obj; } + /** + * Whether a stream is worth deduplicating: an image or an embedded font + * program (large and often shared). Per-page content streams etc. are + * essentially never shared, so hashing them would be wasted work. + * @param {Dict} dict + * @returns {boolean} + */ + #isResourceStream(dict) { + const subtype = dict.get("Subtype"); + return ( + isName(subtype, "Image") || + // FontFile/FontFile2 carry Length1; FontFile3 has one of these Subtypes. + dict.has("Length1") || + isName(subtype, "Type1C") || + isName(subtype, "CIDFontType0C") || + isName(subtype, "OpenType") + ); + } + + /** + * Read the raw, still-encoded bytes of a stream. 
+ * @param {BaseStream} stream + * @returns {Uint8Array} + */ + #rawStreamBytes(stream) { + const original = stream.getOriginalStream(); + original.reset(); + return original.getBytes(); + } + + /** + * Serialize a dictionary to a canonical string. Two clones of the same source + * dict serialize identically, so this works as a bucket key and as an exact + * comparison. + * @param {Dict} dict + * @returns {Promise} + */ + async #serializeDict(dict) { + const buffer = []; + await writeValue(dict, buffer, /* transform = */ null); + return buffer.join(""); + } + + /** + * Cheap bucket key for a resource stream: the serialized dict, the byte + * length, and a few sampled chunks (so large payloads aren't fully hashed). + * Collisions only group candidates that are then compared byte-for-byte, so + * they cost time but never cause a wrong merge. + * @param {string} dictStr + * @param {Uint8Array} bytes + * @returns {string} + */ + #resourceStreamKey(dictStr, bytes) { + const SAMPLE_SIZE = 256; + const SAMPLE_COUNT = 4; + const { length } = bytes; + const hash = new MurmurHash3_64(); + hash.update(dictStr); + hash.update(`#${length}`); + if (length <= SAMPLE_SIZE * SAMPLE_COUNT) { + hash.update(bytes); + } else { + const step = Math.floor((length - SAMPLE_SIZE) / (SAMPLE_COUNT - 1)); + for (let i = 0; i < SAMPLE_COUNT; i++) { + const start = Math.min(i * step, length - SAMPLE_SIZE); + hash.update(bytes.subarray(start, start + SAMPLE_SIZE)); + } + } + return hash.hexdigest(); + } + + /** + * Clone a resource stream and return its output reference, reusing an earlier + * copy when possible. The reference is allocated lazily (in + * #dedupResourceStream), so a reused resource leaves no unused reference. 
+ * @param {Ref} oldRef + * @param {BaseStream} stream + * @param {XRef} xref + * @param {RefSet} resourceStreamPath + * @returns {Promise} + */ + async #collectResourceStream(oldRef, stream, xref, resourceStreamPath) { + const { + currentDocument: { oldRefMapping, resourceStreamPromises }, + } = this; + + // Re-entry means a (malformed) cycle back to this stream: allocate its + // reference now to break the loop, like the generic path's eager alloc. + if (resourceStreamPath.has(oldRef)) { + let ref = oldRefMapping.get(oldRef); + if (!ref) { + ref = this.newRef; + oldRefMapping.put(oldRef, ref); + } + return ref; + } + + const key = oldRef.toString(); + const pending = resourceStreamPromises.get(key); + if (pending) { + return pending; + } + + // The path only grows here, so the shared parent path can be passed + // read-only everywhere else; snapshot it, add this stream, and recurse. + const childPath = new RefSet(resourceStreamPath); + childPath.put(oldRef); + + const promise = Promise.resolve().then(async () => { + const collected = await this.#collectDependencies( + stream, + true, + xref, + childPath + ); + + // A cycle already allocated a reference, so store the clone there. + const cycleRef = oldRefMapping.get(oldRef); + if (cycleRef) { + this.xref[cycleRef.num] = collected; + return cycleRef; + } + + const ref = await this.#dedupResourceStream(collected); + oldRefMapping.put(oldRef, ref); + return ref; + }); + resourceStreamPromises.set(key, promise); + try { + return await promise; + } finally { + if (resourceStreamPromises.get(key) === promise) { + resourceStreamPromises.delete(key); + } + } + } + + /** + * Return the reference for a cloned resource stream, reusing a byte-identical + * earlier copy or else allocating and registering a new one. 
+ * @param {BaseStream} stream + * @returns {Promise} + */ + async #dedupResourceStream(stream) { + const dictStr = await this.#serializeDict(stream.dict); + const bytes = this.#rawStreamBytes(stream); + const key = this.#resourceStreamKey(dictStr, bytes); + + let bucket = this.#resourceStreamCache.get(key); + if (bucket) { + // Same key only means "maybe equal": confirm with an exact comparison. + for (const entry of bucket) { + if ( + entry.dictStr === dictStr && + isArrayEqual(this.#rawStreamBytes(entry.stream), bytes) + ) { + return entry.ref; + } + } + } else { + bucket = []; + this.#resourceStreamCache.set(key, bucket); + } + const ref = this.newRef; + this.xref[ref.num] = stream; + bucket.push({ ref, dictStr, stream }); + return ref; + } + async #cloneStructTreeNode( parentStructRef, node, diff --git a/src/core/writer.js b/src/core/writer.js index 63f5d195e..2a6a7f1d7 100644 --- a/src/core/writer.js +++ b/src/core/writer.js @@ -75,14 +75,17 @@ async function writeStream(stream, buffer, transform) { isName(filterZero, "JBIG2Decode") || isName(filterZero, "CCITTFaxDecode") || isName(filterZero, "LZWDecode"); + const isFilterZeroCompressedObject = + isFilterZeroFlateDecode || + isFilterZeroImageDecode || + isName(filterZero, "BrotliDecode"); // If the string is too small there is no real benefit in compressing it. // The number 256 is arbitrary, but it should be reasonable. 
const MIN_LENGTH_FOR_COMPRESSING = 256; if ( - !isFilterZeroFlateDecode && - !isFilterZeroImageDecode && + !isFilterZeroCompressedObject && bytes.length >= MIN_LENGTH_FOR_COMPRESSING ) { try { diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 08d4e60a9..3710ccda4 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -27,6 +27,7 @@ import { PasswordResponses, PermissionFlag, ResponseException, + stringToBytes, UnknownErrorException, } from "../../src/shared/util.js"; import { @@ -88,6 +89,57 @@ describe("api", function () { .join(""); } + function countMarker(bytes, marker) { + let count = 0; + for (let i = 0, ii = bytes.length - marker.length; i <= ii; i++) { + let j = 0; + while (j < marker.length && bytes[i + j] === marker.charCodeAt(j)) { + j++; + } + if (j === marker.length) { + count++; + } + } + return count; + } + + function buildSharedImageResourcePdf() { + const streamObject = (num, dict, data) => + `${num} 0 obj\n<< ${dict} /Length ${data.length} >>\n` + + `stream\n${data}\nendstream\nendobj\n`; + const objects = [ + "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n", + "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n", + "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 10 10] " + + "/Resources << /XObject << /Im0 4 0 R /Im1 4 0 R >> >> " + + "/Contents 5 0 R >>\nendobj\n", + streamObject( + 4, + "/Type /XObject /Subtype /Image /Width 1 /Height 1 " + + "/ColorSpace /DeviceRGB /BitsPerComponent 8 /Filter /ASCIIHexDecode", + "FF0000>" + ), + streamObject(5, "", "q 10 0 0 10 0 0 cm /Im0 Do Q"), + ]; + + let pdf = "%PDF-1.7\n"; + const offsets = []; + for (const obj of objects) { + offsets.push(pdf.length); + pdf += obj; + } + const xrefOffset = pdf.length; + pdf += `xref\n0 ${objects.length + 1}\n`; + pdf += "0000000000 65535 f \n"; + for (const offset of offsets) { + pdf += `${offset.toString().padStart(10, "0")} 00000 n \n`; + } + pdf += + `trailer\n<< /Size ${objects.length + 1} /Root 1 0 R >>\n` + + 
`startxref\n${xrefOffset}\n%%EOF\n`; + return stringToBytes(pdf); + } + function getNamedNodeInXML(node, path) { for (const component of path.split(".")) { if (!node.childNodes) { @@ -5774,6 +5826,71 @@ small scripts as well as for`); await loadingTask.destroy(); }); + it("deduplicates resource streams shared across merged copies", async function () { + const MARKER = "/CIDFontType0C"; + + const loadingTask = getDocument( + buildGetDocumentParams("doc_1_3_pages.pdf") + ); + const pdfDoc = await loadingTask.promise; + // The same buffer loaded several times yields distinct worker-side + // documents, so this is a true cross-document merge of identical data. + const pdfData = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "doc_2_3_pages.pdf", + }); + + // Baseline: the font programs in one duplicate-free copy. + const dataSingle = await pdfDoc.extractPages([{ document: pdfData }]); + const fontsSingle = countMarker(dataSingle, MARKER); + expect(fontsSingle).toBeGreaterThan(0); + + // Four identical copies must share them (a naive merge would write 4x). + const COPIES = 4; + const dataMany = await pdfDoc.extractPages( + new Array(COPIES).fill(0).map(() => ({ document: pdfData })) + ); + expect(countMarker(dataMany, MARKER)).toEqual(fontsSingle); + + // The merged document must still be valid and render every page. 
+ const newLoadingTask = getDocument({ data: dataMany }); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(3 * COPIES); + for (let i = 1; i <= 3 * COPIES; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual( + `Document 2:Page ${((i - 1) % 3) + 1}` + ); + } + await newLoadingTask.destroy(); + + await loadingTask.destroy(); + }); + + it("deduplicates resource streams reached concurrently", async function () { + const pdfData = buildSharedImageResourcePdf(); + const loadingTask = getDocument({ data: pdfData.slice() }); + const pdfDoc = await loadingTask.promise; + + // The source page has two XObject names pointing at the same image + // stream. Those references are cloned concurrently from the resource + // dictionary, and must not be mistaken for a reference cycle. + const data = await pdfDoc.extractPages([ + { document: pdfData.slice() }, + { document: pdfData.slice() }, + ]); + expect(countMarker(data, "/Subtype /Image")).toEqual(1); + + const newLoadingTask = getDocument({ data }); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(2); + await newPdfDoc.getPage(1); + await newPdfDoc.getPage(2); + await newLoadingTask.destroy(); + + await loadingTask.destroy(); + }); + it("should merge two PDFs with page included ranges", async function () { const loadingTask = getDocument( buildGetDocumentParams("tracemonkey.pdf")