Deduplicate shared font/image streams when merging PDFs

Identical embedded fonts and images across the merged documents are now written once and shared, instead of being copied per source file. Also, avoid recompressing streams that are already Brotli-compressed.
2026-07-21 06:27:23 +02:00 · 2026-06-02 22:08:16 +02:00 · 2026-06-02 22:08:16 +02:00 · 1a7821ab13
commit 1a7821ab13
parent b43ef1c746
3 changed files with 341 additions and 15 deletions
--- a/src/core/editor/pdf_editor.js
+++ b/src/core/editor/pdf_editor.js
@ -29,14 +29,15 @@ import {
 } from "../core_utils.js";
 import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js";
 import { incrementalUpdate, writeValue } from "../writer.js";
+import { isArrayEqual, stringToBytes } from "../../shared/util.js";
 import { NameTree, NumberTree } from "../name_number_tree.js";
 import { stringToAsciiOrUTF16BE, stringToPDFString } from "../string_utils.js";
 import { AnnotationFactory } from "../annotation.js";
 import { BaseStream } from "../base_stream.js";
 import { createImage } from "./pdf_images.js";
 import { LETTER_SIZE_MEDIABOX } from "../document.js";
+import { MurmurHash3_64 } from "../../shared/murmurhash3.js";
 import { StringStream } from "../stream.js";
-import { stringToBytes } from "../../shared/util.js";

 const MAX_LEAVES_PER_PAGES_NODE = 16;
 const MAX_IN_NAME_TREE_NODE = 64;
@ -63,6 +64,7 @@ class DocumentData {
    this.dedupNamedDestinations = new Map();
    this.usedNamedDestinations = new Set();
    this.postponedRefCopies = new RefSetCache();
+    this.resourceStreamPromises = new Map();
    this.usedStructParents = new Set();
    this.oldStructParentMapping = new Map();
    this.structTreeRoot = null;
@ -126,6 +128,12 @@ class PDFEditor {

  #primaryDocument = null;

+  // Deduplicates resource streams (fonts/images) shared across the merged
+  // documents. Maps a cheap content key to a bucket of { ref, dictStr, stream }
+  // candidates; the key only groups possible matches, an exact byte comparison
+  // decides, so a key collision can never alias two distinct resources.
+  #resourceStreamCache = new Map();
+
  currentDocument = null;

  oldPages = [];
@ -232,16 +240,22 @@ class PDFEditor {
   * @param {*} obj
   * @param {boolean} mustClone
   * @param {XRef} xref
+   * @param {RefSet} resourceStreamPath
   * @returns {Promise<*>}
   */
-  async #collectDependencies(obj, mustClone, xref) {
+  async #collectDependencies(
+    obj,
+    mustClone,
+    xref,
+    resourceStreamPath = new RefSet()
+  ) {
    if (obj instanceof Ref) {
      const {
        currentDocument: { oldRefMapping },
      } = this;
-      let newRef = oldRefMapping.get(obj);
-      if (newRef) {
-        return newRef;
+      const existingRef = oldRefMapping.get(obj);
+      if (existingRef) {
+        return existingRef;
      }
      const oldRef = obj;
      obj = await xref.fetchAsync(oldRef);
@ -250,7 +264,19 @@ class PDFEditor {
        return obj;
      }

-      newRef = this.newRef;
+      // Deduplicate fonts/images against earlier copies (common when merging
+      // exports of the same template). Reusing a copy costs no reference, so
+      // allocation is deferred to #collectResourceStream until it's known new.
+      if (obj instanceof BaseStream && this.#isResourceStream(obj.dict)) {
+        return this.#collectResourceStream(
+          oldRef,
+          obj,
+          xref,
+          resourceStreamPath
+        );
+      }
+
+      const newRef = this.newRef;
      oldRefMapping.put(oldRef, newRef);

      if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
@ -265,7 +291,12 @@ class PDFEditor {
        }
      }

-      this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
+      this.xref[newRef.num] = await this.#collectDependencies(
+        obj,
+        true,
+        xref,
+        resourceStreamPath
+      );
      return newRef;
    }
    const promises = [];
@ -285,9 +316,12 @@ class PDFEditor {
          continue;
        }
        promises.push(
-          this.#collectDependencies(obj[i], true, xref).then(
-            newObj => (obj[i] = newObj)
-          )
+          this.#collectDependencies(
+            obj[i],
+            true,
+            xref,
+            resourceStreamPath
+          ).then(newObj => (obj[i] = newObj))
        );
      }
      await Promise.all(promises);
@ -314,9 +348,12 @@ class PDFEditor {
          continue;
        }
        promises.push(
-          this.#collectDependencies(rawObj, true, xref).then(newObj =>
-            dict.set(key, newObj)
-          )
+          this.#collectDependencies(
+            rawObj,
+            true,
+            xref,
+            resourceStreamPath
+          ).then(newObj => dict.set(key, newObj))
        );
      }
      await Promise.all(promises);
@ -325,6 +362,175 @@ class PDFEditor {
    return obj;
  }

+  /**
+   * Whether a stream is worth deduplicating: an image or an embedded font
+   * program (large and often shared). Per-page content streams etc. are
+   * essentially never shared, so hashing them would be wasted work.
+   * @param {Dict} dict
+   * @returns {boolean}
+   */
+  #isResourceStream(dict) {
+    const subtype = dict.get("Subtype");
+    return (
+      isName(subtype, "Image") ||
+      // FontFile/FontFile2 carry Length1; FontFile3 has one of these Subtypes.
+      dict.has("Length1") ||
+      isName(subtype, "Type1C") ||
+      isName(subtype, "CIDFontType0C") ||
+      isName(subtype, "OpenType")
+    );
+  }
+
+  /**
+   * Read the raw, still-encoded bytes of a stream.
+   * @param {BaseStream} stream
+   * @returns {Uint8Array}
+   */
+  #rawStreamBytes(stream) {
+    const original = stream.getOriginalStream();
+    original.reset();
+    return original.getBytes();
+  }
+
+  /**
+   * Serialize a dictionary to a canonical string. Two clones of the same source
+   * dict serialize identically, so this works as a bucket key and as an exact
+   * comparison.
+   * @param {Dict} dict
+   * @returns {Promise<string>}
+   */
+  async #serializeDict(dict) {
+    const buffer = [];
+    await writeValue(dict, buffer, /* transform = */ null);
+    return buffer.join("");
+  }
+
+  /**
+   * Cheap bucket key for a resource stream: the serialized dict, the byte
+   * length, and a few sampled chunks (so large payloads aren't fully hashed).
+   * Collisions only group candidates that are then compared byte-for-byte, so
+   * they cost time but never cause a wrong merge.
+   * @param {string} dictStr
+   * @param {Uint8Array} bytes
+   * @returns {string}
+   */
+  #resourceStreamKey(dictStr, bytes) {
+    const SAMPLE_SIZE = 256;
+    const SAMPLE_COUNT = 4;
+    const { length } = bytes;
+    const hash = new MurmurHash3_64();
+    hash.update(dictStr);
+    hash.update(`#${length}`);
+    if (length <= SAMPLE_SIZE * SAMPLE_COUNT) {
+      hash.update(bytes);
+    } else {
+      const step = Math.floor((length - SAMPLE_SIZE) / (SAMPLE_COUNT - 1));
+      for (let i = 0; i < SAMPLE_COUNT; i++) {
+        const start = Math.min(i * step, length - SAMPLE_SIZE);
+        hash.update(bytes.subarray(start, start + SAMPLE_SIZE));
+      }
+    }
+    return hash.hexdigest();
+  }
+
+  /**
+   * Clone a resource stream and return its output reference, reusing an earlier
+   * copy when possible. The reference is allocated lazily (in
+   * #dedupResourceStream), so a reused resource leaves no unused reference.
+   * @param {Ref} oldRef
+   * @param {BaseStream} stream
+   * @param {XRef} xref
+   * @param {RefSet} resourceStreamPath
+   * @returns {Promise<Ref>}
+   */
+  async #collectResourceStream(oldRef, stream, xref, resourceStreamPath) {
+    const {
+      currentDocument: { oldRefMapping, resourceStreamPromises },
+    } = this;
+
+    // Re-entry means a (malformed) cycle back to this stream: allocate its
+    // reference now to break the loop, like the generic path's eager alloc.
+    if (resourceStreamPath.has(oldRef)) {
+      let ref = oldRefMapping.get(oldRef);
+      if (!ref) {
+        ref = this.newRef;
+        oldRefMapping.put(oldRef, ref);
+      }
+      return ref;
+    }
+
+    const key = oldRef.toString();
+    const pending = resourceStreamPromises.get(key);
+    if (pending) {
+      return pending;
+    }
+
+    // The path only grows here, so the shared parent path can be passed
+    // read-only everywhere else; snapshot it, add this stream, and recurse.
+    const childPath = new RefSet(resourceStreamPath);
+    childPath.put(oldRef);
+
+    const promise = Promise.resolve().then(async () => {
+      const collected = await this.#collectDependencies(
+        stream,
+        true,
+        xref,
+        childPath
+      );
+
+      // A cycle already allocated a reference, so store the clone there.
+      const cycleRef = oldRefMapping.get(oldRef);
+      if (cycleRef) {
+        this.xref[cycleRef.num] = collected;
+        return cycleRef;
+      }
+
+      const ref = await this.#dedupResourceStream(collected);
+      oldRefMapping.put(oldRef, ref);
+      return ref;
+    });
+    resourceStreamPromises.set(key, promise);
+    try {
+      return await promise;
+    } finally {
+      if (resourceStreamPromises.get(key) === promise) {
+        resourceStreamPromises.delete(key);
+      }
+    }
+  }
+
+  /**
+   * Return the reference for a cloned resource stream, reusing a byte-identical
+   * earlier copy or else allocating and registering a new one.
+   * @param {BaseStream} stream
+   * @returns {Promise<Ref>}
+   */
+  async #dedupResourceStream(stream) {
+    const dictStr = await this.#serializeDict(stream.dict);
+    const bytes = this.#rawStreamBytes(stream);
+    const key = this.#resourceStreamKey(dictStr, bytes);
+
+    let bucket = this.#resourceStreamCache.get(key);
+    if (bucket) {
+      // Same key only means "maybe equal": confirm with an exact comparison.
+      for (const entry of bucket) {
+        if (
+          entry.dictStr === dictStr &&
+          isArrayEqual(this.#rawStreamBytes(entry.stream), bytes)
+        ) {
+          return entry.ref;
+        }
+      }
+    } else {
+      bucket = [];
+      this.#resourceStreamCache.set(key, bucket);
+    }
+    const ref = this.newRef;
+    this.xref[ref.num] = stream;
+    bucket.push({ ref, dictStr, stream });
+    return ref;
+  }
+
  async #cloneStructTreeNode(
    parentStructRef,
    node,
--- a/src/core/writer.js
+++ b/src/core/writer.js
@ -75,14 +75,17 @@ async function writeStream(stream, buffer, transform) {
    isName(filterZero, "JBIG2Decode") ||
    isName(filterZero, "CCITTFaxDecode") ||
    isName(filterZero, "LZWDecode");
+  const isFilterZeroCompressedObject =
+    isFilterZeroFlateDecode ||
+    isFilterZeroImageDecode ||
+    isName(filterZero, "BrotliDecode");

  // If the string is too small there is no real benefit in compressing it.
  // The number 256 is arbitrary, but it should be reasonable.
  const MIN_LENGTH_FOR_COMPRESSING = 256;

  if (
-    !isFilterZeroFlateDecode &&
-    !isFilterZeroImageDecode &&
+    !isFilterZeroCompressedObject &&
    bytes.length >= MIN_LENGTH_FOR_COMPRESSING
  ) {
    try {
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -27,6 +27,7 @@ import {
  PasswordResponses,
  PermissionFlag,
  ResponseException,
+  stringToBytes,
  UnknownErrorException,
 } from "../../src/shared/util.js";
 import {
@ -88,6 +89,57 @@ describe("api", function () {
      .join("");
  }

+  function countMarker(bytes, marker) {
+    let count = 0;
+    for (let i = 0, ii = bytes.length - marker.length; i <= ii; i++) {
+      let j = 0;
+      while (j < marker.length && bytes[i + j] === marker.charCodeAt(j)) {
+        j++;
+      }
+      if (j === marker.length) {
+        count++;
+      }
+    }
+    return count;
+  }
+
+  function buildSharedImageResourcePdf() {
+    const streamObject = (num, dict, data) =>
+      `${num} 0 obj\n<< ${dict} /Length ${data.length} >>\n` +
+      `stream\n${data}\nendstream\nendobj\n`;
+    const objects = [
+      "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
+      "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
+      "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 10 10] " +
+        "/Resources << /XObject << /Im0 4 0 R /Im1 4 0 R >> >> " +
+        "/Contents 5 0 R >>\nendobj\n",
+      streamObject(
+        4,
+        "/Type /XObject /Subtype /Image /Width 1 /Height 1 " +
+          "/ColorSpace /DeviceRGB /BitsPerComponent 8 /Filter /ASCIIHexDecode",
+        "FF0000>"
+      ),
+      streamObject(5, "", "q 10 0 0 10 0 0 cm /Im0 Do Q"),
+    ];
+
+    let pdf = "%PDF-1.7\n";
+    const offsets = [];
+    for (const obj of objects) {
+      offsets.push(pdf.length);
+      pdf += obj;
+    }
+    const xrefOffset = pdf.length;
+    pdf += `xref\n0 ${objects.length + 1}\n`;
+    pdf += "0000000000 65535 f \n";
+    for (const offset of offsets) {
+      pdf += `${offset.toString().padStart(10, "0")} 00000 n \n`;
+    }
+    pdf +=
+      `trailer\n<< /Size ${objects.length + 1} /Root 1 0 R >>\n` +
+      `startxref\n${xrefOffset}\n%%EOF\n`;
+    return stringToBytes(pdf);
+  }
+
  function getNamedNodeInXML(node, path) {
    for (const component of path.split(".")) {
      if (!node.childNodes) {
@ -5774,6 +5826,71 @@ small scripts as well as for`);
        await loadingTask.destroy();
      });

+      it("deduplicates resource streams shared across merged copies", async function () {
+        const MARKER = "/CIDFontType0C";
+
+        const loadingTask = getDocument(
+          buildGetDocumentParams("doc_1_3_pages.pdf")
+        );
+        const pdfDoc = await loadingTask.promise;
+        // The same buffer loaded several times yields distinct worker-side
+        // documents, so this is a true cross-document merge of identical data.
+        const pdfData = await DefaultFileReaderFactory.fetch({
+          path: TEST_PDFS_PATH + "doc_2_3_pages.pdf",
+        });
+
+        // Baseline: the font programs in one duplicate-free copy.
+        const dataSingle = await pdfDoc.extractPages([{ document: pdfData }]);
+        const fontsSingle = countMarker(dataSingle, MARKER);
+        expect(fontsSingle).toBeGreaterThan(0);
+
+        // Four identical copies must share them (a naive merge would write 4x).
+        const COPIES = 4;
+        const dataMany = await pdfDoc.extractPages(
+          new Array(COPIES).fill(0).map(() => ({ document: pdfData }))
+        );
+        expect(countMarker(dataMany, MARKER)).toEqual(fontsSingle);
+
+        // The merged document must still be valid and render every page.
+        const newLoadingTask = getDocument({ data: dataMany });
+        const newPdfDoc = await newLoadingTask.promise;
+        expect(newPdfDoc.numPages).toEqual(3 * COPIES);
+        for (let i = 1; i <= 3 * COPIES; i++) {
+          const pdfPage = await newPdfDoc.getPage(i);
+          const { items: textItems } = await pdfPage.getTextContent();
+          expect(mergeText(textItems)).toEqual(
+            `Document 2:Page ${((i - 1) % 3) + 1}`
+          );
+        }
+        await newLoadingTask.destroy();
+
+        await loadingTask.destroy();
+      });
+
+      it("deduplicates resource streams reached concurrently", async function () {
+        const pdfData = buildSharedImageResourcePdf();
+        const loadingTask = getDocument({ data: pdfData.slice() });
+        const pdfDoc = await loadingTask.promise;
+
+        // The source page has two XObject names pointing at the same image
+        // stream. Those references are cloned concurrently from the resource
+        // dictionary, and must not be mistaken for a reference cycle.
+        const data = await pdfDoc.extractPages([
+          { document: pdfData.slice() },
+          { document: pdfData.slice() },
+        ]);
+        expect(countMarker(data, "/Subtype /Image")).toEqual(1);
+
+        const newLoadingTask = getDocument({ data });
+        const newPdfDoc = await newLoadingTask.promise;
+        expect(newPdfDoc.numPages).toEqual(2);
+        await newPdfDoc.getPage(1);
+        await newPdfDoc.getPage(2);
+        await newLoadingTask.destroy();
+
+        await loadingTask.destroy();
+      });
+
      it("should merge two PDFs with page included ranges", async function () {
        const loadingTask = getDocument(
          buildGetDocumentParams("tracemonkey.pdf")