diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index f1945f41b..2ed888c73 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -29,14 +29,15 @@ import { } from "../core_utils.js"; import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js"; import { incrementalUpdate, writeValue } from "../writer.js"; +import { isArrayEqual, stringToBytes } from "../../shared/util.js"; import { NameTree, NumberTree } from "../name_number_tree.js"; import { stringToAsciiOrUTF16BE, stringToPDFString } from "../string_utils.js"; import { AnnotationFactory } from "../annotation.js"; import { BaseStream } from "../base_stream.js"; import { createImage } from "./pdf_images.js"; import { LETTER_SIZE_MEDIABOX } from "../document.js"; +import { MurmurHash3_64 } from "../../shared/murmurhash3.js"; import { StringStream } from "../stream.js"; -import { stringToBytes } from "../../shared/util.js"; const MAX_LEAVES_PER_PAGES_NODE = 16; const MAX_IN_NAME_TREE_NODE = 64; @@ -63,6 +64,7 @@ class DocumentData { this.dedupNamedDestinations = new Map(); this.usedNamedDestinations = new Set(); this.postponedRefCopies = new RefSetCache(); + this.resourceStreamPromises = new Map(); this.usedStructParents = new Set(); this.oldStructParentMapping = new Map(); this.structTreeRoot = null; @@ -126,6 +128,12 @@ class PDFEditor { #primaryDocument = null; + // Deduplicates resource streams (fonts/images) shared across the merged + // documents. Maps a cheap content key to a bucket of { ref, dictStr, stream } + // candidates; the key only groups possible matches, an exact byte comparison + // decides, so a key collision can never alias two distinct resources. + #resourceStreamCache = new Map(); + currentDocument = null; oldPages = []; @@ -232,16 +240,22 @@ class PDFEditor { * @param {*} obj * @param {boolean} mustClone * @param {XRef} xref + * @param {RefSet} resourceStreamPath * @returns {Promise<*>} */ - async #collectDependencies(obj, mustClone, xref) { + async #collectDependencies( + obj, + mustClone, + xref, + resourceStreamPath = new RefSet() + ) { if (obj instanceof Ref) { const { currentDocument: { oldRefMapping }, } = this; - let newRef = oldRefMapping.get(obj); - if (newRef) { - return newRef; + const existingRef = oldRefMapping.get(obj); + if (existingRef) { + return existingRef; } const oldRef = obj; obj = await xref.fetchAsync(oldRef); @@ -250,7 +264,19 @@ class PDFEditor { return obj; } - newRef = this.newRef; + // Deduplicate fonts/images against earlier copies (common when merging + // exports of the same template). Reusing a copy costs no reference, so + // allocation is deferred to #collectResourceStream until it's known new. + if (obj instanceof BaseStream && this.#isResourceStream(obj.dict)) { + return this.#collectResourceStream( + oldRef, + obj, + xref, + resourceStreamPath + ); + } + + const newRef = this.newRef; oldRefMapping.put(oldRef, newRef); if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) { @@ -265,7 +291,12 @@ class PDFEditor { } } - this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref); + this.xref[newRef.num] = await this.#collectDependencies( + obj, + true, + xref, + resourceStreamPath + ); return newRef; } const promises = []; @@ -285,9 +316,12 @@ class PDFEditor { continue; } promises.push( - this.#collectDependencies(obj[i], true, xref).then( - newObj => (obj[i] = newObj) - ) + this.#collectDependencies( + obj[i], + true, + xref, + resourceStreamPath + ).then(newObj => (obj[i] = newObj)) ); } await Promise.all(promises); @@ -314,9 +348,12 @@ class PDFEditor { continue; } promises.push( - this.#collectDependencies(rawObj, true, xref).then(newObj => - dict.set(key, newObj) - ) + this.#collectDependencies( + rawObj, + true, + xref, + resourceStreamPath + ).then(newObj => dict.set(key, newObj)) ); } await Promise.all(promises); @@ -325,6 +362,175 @@ class PDFEditor { return obj; } + /** + * Whether a stream is worth deduplicating: an image or an embedded font + * program (large and often shared). Per-page content streams etc. are + * essentially never shared, so hashing them would be wasted work. + * @param {Dict} dict + * @returns {boolean} + */ + #isResourceStream(dict) { + const subtype = dict.get("Subtype"); + return ( + isName(subtype, "Image") || + // FontFile/FontFile2 carry Length1; FontFile3 has one of these Subtypes. + dict.has("Length1") || + isName(subtype, "Type1C") || + isName(subtype, "CIDFontType0C") || + isName(subtype, "OpenType") + ); + } + + /** + * Read the raw, still-encoded bytes of a stream. + * @param {BaseStream} stream + * @returns {Uint8Array} + */ + #rawStreamBytes(stream) { + const original = stream.getOriginalStream(); + original.reset(); + return original.getBytes(); + } + + /** + * Serialize a dictionary to a canonical string. Two clones of the same source + * dict serialize identically, so this works as a bucket key and as an exact + * comparison. + * @param {Dict} dict + * @returns {Promise} + */ + async #serializeDict(dict) { + const buffer = []; + await writeValue(dict, buffer, /* transform = */ null); + return buffer.join(""); + } + + /** + * Cheap bucket key for a resource stream: the serialized dict, the byte + * length, and a few sampled chunks (so large payloads aren't fully hashed). + * Collisions only group candidates that are then compared byte-for-byte, so + * they cost time but never cause a wrong merge. + * @param {string} dictStr + * @param {Uint8Array} bytes + * @returns {string} + */ + #resourceStreamKey(dictStr, bytes) { + const SAMPLE_SIZE = 256; + const SAMPLE_COUNT = 4; + const { length } = bytes; + const hash = new MurmurHash3_64(); + hash.update(dictStr); + hash.update(`#${length}`); + if (length <= SAMPLE_SIZE * SAMPLE_COUNT) { + hash.update(bytes); + } else { + const step = Math.floor((length - SAMPLE_SIZE) / (SAMPLE_COUNT - 1)); + for (let i = 0; i < SAMPLE_COUNT; i++) { + const start = Math.min(i * step, length - SAMPLE_SIZE); + hash.update(bytes.subarray(start, start + SAMPLE_SIZE)); + } + } + return hash.hexdigest(); + } + + /** + * Clone a resource stream and return its output reference, reusing an earlier + * copy when possible. The reference is allocated lazily (in + * #dedupResourceStream), so a reused resource leaves no unused reference. + * @param {Ref} oldRef + * @param {BaseStream} stream + * @param {XRef} xref + * @param {RefSet} resourceStreamPath + * @returns {Promise} + */ + async #collectResourceStream(oldRef, stream, xref, resourceStreamPath) { + const { + currentDocument: { oldRefMapping, resourceStreamPromises }, + } = this; + + // Re-entry means a (malformed) cycle back to this stream: allocate its + // reference now to break the loop, like the generic path's eager alloc. + if (resourceStreamPath.has(oldRef)) { + let ref = oldRefMapping.get(oldRef); + if (!ref) { + ref = this.newRef; + oldRefMapping.put(oldRef, ref); + } + return ref; + } + + const key = oldRef.toString(); + const pending = resourceStreamPromises.get(key); + if (pending) { + return pending; + } + + // The path only grows here, so the shared parent path can be passed + // read-only everywhere else; snapshot it, add this stream, and recurse. + const childPath = new RefSet(resourceStreamPath); + childPath.put(oldRef); + + const promise = Promise.resolve().then(async () => { + const collected = await this.#collectDependencies( + stream, + true, + xref, + childPath + ); + + // A cycle already allocated a reference, so store the clone there. + const cycleRef = oldRefMapping.get(oldRef); + if (cycleRef) { + this.xref[cycleRef.num] = collected; + return cycleRef; + } + + const ref = await this.#dedupResourceStream(collected); + oldRefMapping.put(oldRef, ref); + return ref; + }); + resourceStreamPromises.set(key, promise); + try { + return await promise; + } finally { + if (resourceStreamPromises.get(key) === promise) { + resourceStreamPromises.delete(key); + } + } + } + + /** + * Return the reference for a cloned resource stream, reusing a byte-identical + * earlier copy or else allocating and registering a new one. + * @param {BaseStream} stream + * @returns {Promise} + */ + async #dedupResourceStream(stream) { + const dictStr = await this.#serializeDict(stream.dict); + const bytes = this.#rawStreamBytes(stream); + const key = this.#resourceStreamKey(dictStr, bytes); + + let bucket = this.#resourceStreamCache.get(key); + if (bucket) { + // Same key only means "maybe equal": confirm with an exact comparison. + for (const entry of bucket) { + if ( + entry.dictStr === dictStr && + isArrayEqual(this.#rawStreamBytes(entry.stream), bytes) + ) { + return entry.ref; + } + } + } else { + bucket = []; + this.#resourceStreamCache.set(key, bucket); + } + const ref = this.newRef; + this.xref[ref.num] = stream; + bucket.push({ ref, dictStr, stream }); + return ref; + } + async #cloneStructTreeNode( parentStructRef, node, diff --git a/src/core/writer.js b/src/core/writer.js index 63f5d195e..2a6a7f1d7 100644 --- a/src/core/writer.js +++ b/src/core/writer.js @@ -75,14 +75,17 @@ async function writeStream(stream, buffer, transform) { isName(filterZero, "JBIG2Decode") || isName(filterZero, "CCITTFaxDecode") || isName(filterZero, "LZWDecode"); + const isFilterZeroCompressedObject = + isFilterZeroFlateDecode || + isFilterZeroImageDecode || + isName(filterZero, "BrotliDecode"); // If the string is too small there is no real benefit in compressing it. // The number 256 is arbitrary, but it should be reasonable. const MIN_LENGTH_FOR_COMPRESSING = 256; if ( - !isFilterZeroFlateDecode && - !isFilterZeroImageDecode && + !isFilterZeroCompressedObject && bytes.length >= MIN_LENGTH_FOR_COMPRESSING ) { try { diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index d2d2eb0ee..c31bc5797 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -27,6 +27,7 @@ import { PasswordResponses, PermissionFlag, ResponseException, + stringToBytes, UnknownErrorException, } from "../../src/shared/util.js"; import { @@ -88,6 +89,57 @@ describe("api", function () { .join(""); } + function countMarker(bytes, marker) { + let count = 0; + for (let i = 0, ii = bytes.length - marker.length; i <= ii; i++) { + let j = 0; + while (j < marker.length && bytes[i + j] === marker.charCodeAt(j)) { + j++; + } + if (j === marker.length) { + count++; + } + } + return count; + } + + function buildSharedImageResourcePdf() { + const streamObject = (num, dict, data) => + `${num} 0 obj\n<< ${dict} /Length ${data.length} >>\n` + + `stream\n${data}\nendstream\nendobj\n`; + const objects = [ + "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n", + "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n", + "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 10 10] " + + "/Resources << /XObject << /Im0 4 0 R /Im1 4 0 R >> >> " + + "/Contents 5 0 R >>\nendobj\n", + streamObject( + 4, + "/Type /XObject /Subtype /Image /Width 1 /Height 1 " + + "/ColorSpace /DeviceRGB /BitsPerComponent 8 /Filter /ASCIIHexDecode", + "FF0000>" + ), + streamObject(5, "", "q 10 0 0 10 0 0 cm /Im0 Do Q"), + ]; + + let pdf = "%PDF-1.7\n"; + const offsets = []; + for (const obj of objects) { + offsets.push(pdf.length); + pdf += obj; + } + const xrefOffset = pdf.length; + pdf += `xref\n0 ${objects.length + 1}\n`; + pdf += "0000000000 65535 f \n"; + for (const offset of offsets) { + pdf += `${offset.toString().padStart(10, "0")} 00000 n \n`; + } + pdf += + `trailer\n<< /Size ${objects.length + 1} /Root 1 0 R >>\n` + + `startxref\n${xrefOffset}\n%%EOF\n`; + return stringToBytes(pdf); + } + function getNamedNodeInXML(node, path) { for (const component of path.split(".")) { if (!node.childNodes) { @@ -5873,6 +5925,71 @@ small scripts as well as for`); await loadingTask.destroy(); }); + it("deduplicates resource streams shared across merged copies", async function () { + const MARKER = "/CIDFontType0C"; + + const loadingTask = getDocument( + buildGetDocumentParams("doc_1_3_pages.pdf") + ); + const pdfDoc = await loadingTask.promise; + // The same buffer loaded several times yields distinct worker-side + // documents, so this is a true cross-document merge of identical data. + const pdfData = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "doc_2_3_pages.pdf", + }); + + // Baseline: the font programs in one duplicate-free copy. + const dataSingle = await pdfDoc.extractPages([{ document: pdfData }]); + const fontsSingle = countMarker(dataSingle, MARKER); + expect(fontsSingle).toBeGreaterThan(0); + + // Four identical copies must share them (a naive merge would write 4x). + const COPIES = 4; + const dataMany = await pdfDoc.extractPages( + new Array(COPIES).fill(0).map(() => ({ document: pdfData })) + ); + expect(countMarker(dataMany, MARKER)).toEqual(fontsSingle); + + // The merged document must still be valid and render every page. + const newLoadingTask = getDocument({ data: dataMany }); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(3 * COPIES); + for (let i = 1; i <= 3 * COPIES; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual( + `Document 2:Page ${((i - 1) % 3) + 1}` + ); + } + await newLoadingTask.destroy(); + + await loadingTask.destroy(); + }); + + it("deduplicates resource streams reached concurrently", async function () { + const pdfData = buildSharedImageResourcePdf(); + const loadingTask = getDocument({ data: pdfData.slice() }); + const pdfDoc = await loadingTask.promise; + + // The source page has two XObject names pointing at the same image + // stream. Those references are cloned concurrently from the resource + // dictionary, and must not be mistaken for a reference cycle. + const data = await pdfDoc.extractPages([ + { document: pdfData.slice() }, + { document: pdfData.slice() }, + ]); + expect(countMarker(data, "/Subtype /Image")).toEqual(1); + + const newLoadingTask = getDocument({ data }); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(2); + await newPdfDoc.getPage(1); + await newPdfDoc.getPage(2); + await newLoadingTask.destroy(); + + await loadingTask.destroy(); + }); + it("should merge two PDFs with page included ranges", async function () { const loadingTask = getDocument( buildGetDocumentParams("tracemonkey.pdf")