mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-06-05 01:31:00 +02:00
Merge pull request #21379 from calixteman/dedup_stream_merging
Deduplicate shared font/image streams when merging PDFs
This commit is contained in:
commit
23ea0810d9
@ -29,14 +29,15 @@ import {
|
||||
} from "../core_utils.js";
|
||||
import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js";
|
||||
import { incrementalUpdate, writeValue } from "../writer.js";
|
||||
import { isArrayEqual, stringToBytes } from "../../shared/util.js";
|
||||
import { NameTree, NumberTree } from "../name_number_tree.js";
|
||||
import { stringToAsciiOrUTF16BE, stringToPDFString } from "../string_utils.js";
|
||||
import { AnnotationFactory } from "../annotation.js";
|
||||
import { BaseStream } from "../base_stream.js";
|
||||
import { createImage } from "./pdf_images.js";
|
||||
import { LETTER_SIZE_MEDIABOX } from "../document.js";
|
||||
import { MurmurHash3_64 } from "../../shared/murmurhash3.js";
|
||||
import { StringStream } from "../stream.js";
|
||||
import { stringToBytes } from "../../shared/util.js";
|
||||
|
||||
const MAX_LEAVES_PER_PAGES_NODE = 16;
|
||||
const MAX_IN_NAME_TREE_NODE = 64;
|
||||
@ -63,6 +64,7 @@ class DocumentData {
|
||||
this.dedupNamedDestinations = new Map();
|
||||
this.usedNamedDestinations = new Set();
|
||||
this.postponedRefCopies = new RefSetCache();
|
||||
this.resourceStreamPromises = new Map();
|
||||
this.usedStructParents = new Set();
|
||||
this.oldStructParentMapping = new Map();
|
||||
this.structTreeRoot = null;
|
||||
@ -126,6 +128,12 @@ class PDFEditor {
|
||||
|
||||
#primaryDocument = null;
|
||||
|
||||
// Deduplicates resource streams (fonts/images) shared across the merged
|
||||
// documents. Maps a cheap content key to a bucket of { ref, dictStr, stream }
|
||||
// candidates; the key only groups possible matches, an exact byte comparison
|
||||
// decides, so a key collision can never alias two distinct resources.
|
||||
#resourceStreamCache = new Map();
|
||||
|
||||
currentDocument = null;
|
||||
|
||||
oldPages = [];
|
||||
@ -232,16 +240,22 @@ class PDFEditor {
|
||||
* @param {*} obj
|
||||
* @param {boolean} mustClone
|
||||
* @param {XRef} xref
|
||||
* @param {RefSet} resourceStreamPath
|
||||
* @returns {Promise<*>}
|
||||
*/
|
||||
async #collectDependencies(obj, mustClone, xref) {
|
||||
async #collectDependencies(
|
||||
obj,
|
||||
mustClone,
|
||||
xref,
|
||||
resourceStreamPath = new RefSet()
|
||||
) {
|
||||
if (obj instanceof Ref) {
|
||||
const {
|
||||
currentDocument: { oldRefMapping },
|
||||
} = this;
|
||||
let newRef = oldRefMapping.get(obj);
|
||||
if (newRef) {
|
||||
return newRef;
|
||||
const existingRef = oldRefMapping.get(obj);
|
||||
if (existingRef) {
|
||||
return existingRef;
|
||||
}
|
||||
const oldRef = obj;
|
||||
obj = await xref.fetchAsync(oldRef);
|
||||
@ -250,7 +264,19 @@ class PDFEditor {
|
||||
return obj;
|
||||
}
|
||||
|
||||
newRef = this.newRef;
|
||||
// Deduplicate fonts/images against earlier copies (common when merging
|
||||
// exports of the same template). Reusing a copy costs no reference, so
|
||||
// allocation is deferred to #collectResourceStream until it's known new.
|
||||
if (obj instanceof BaseStream && this.#isResourceStream(obj.dict)) {
|
||||
return this.#collectResourceStream(
|
||||
oldRef,
|
||||
obj,
|
||||
xref,
|
||||
resourceStreamPath
|
||||
);
|
||||
}
|
||||
|
||||
const newRef = this.newRef;
|
||||
oldRefMapping.put(oldRef, newRef);
|
||||
|
||||
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
|
||||
@ -265,7 +291,12 @@ class PDFEditor {
|
||||
}
|
||||
}
|
||||
|
||||
this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
|
||||
this.xref[newRef.num] = await this.#collectDependencies(
|
||||
obj,
|
||||
true,
|
||||
xref,
|
||||
resourceStreamPath
|
||||
);
|
||||
return newRef;
|
||||
}
|
||||
const promises = [];
|
||||
@ -285,9 +316,12 @@ class PDFEditor {
|
||||
continue;
|
||||
}
|
||||
promises.push(
|
||||
this.#collectDependencies(obj[i], true, xref).then(
|
||||
newObj => (obj[i] = newObj)
|
||||
)
|
||||
this.#collectDependencies(
|
||||
obj[i],
|
||||
true,
|
||||
xref,
|
||||
resourceStreamPath
|
||||
).then(newObj => (obj[i] = newObj))
|
||||
);
|
||||
}
|
||||
await Promise.all(promises);
|
||||
@ -314,9 +348,12 @@ class PDFEditor {
|
||||
continue;
|
||||
}
|
||||
promises.push(
|
||||
this.#collectDependencies(rawObj, true, xref).then(newObj =>
|
||||
dict.set(key, newObj)
|
||||
)
|
||||
this.#collectDependencies(
|
||||
rawObj,
|
||||
true,
|
||||
xref,
|
||||
resourceStreamPath
|
||||
).then(newObj => dict.set(key, newObj))
|
||||
);
|
||||
}
|
||||
await Promise.all(promises);
|
||||
@ -325,6 +362,175 @@ class PDFEditor {
|
||||
return obj;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether a stream is worth deduplicating: an image or an embedded font
|
||||
* program (large and often shared). Per-page content streams etc. are
|
||||
* essentially never shared, so hashing them would be wasted work.
|
||||
* @param {Dict} dict
|
||||
* @returns {boolean}
|
||||
*/
|
||||
#isResourceStream(dict) {
|
||||
const subtype = dict.get("Subtype");
|
||||
return (
|
||||
isName(subtype, "Image") ||
|
||||
// FontFile/FontFile2 carry Length1; FontFile3 has one of these Subtypes.
|
||||
dict.has("Length1") ||
|
||||
isName(subtype, "Type1C") ||
|
||||
isName(subtype, "CIDFontType0C") ||
|
||||
isName(subtype, "OpenType")
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the raw, still-encoded bytes of a stream.
|
||||
* @param {BaseStream} stream
|
||||
* @returns {Uint8Array}
|
||||
*/
|
||||
#rawStreamBytes(stream) {
|
||||
const original = stream.getOriginalStream();
|
||||
original.reset();
|
||||
return original.getBytes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize a dictionary to a canonical string. Two clones of the same source
|
||||
* dict serialize identically, so this works as a bucket key and as an exact
|
||||
* comparison.
|
||||
* @param {Dict} dict
|
||||
* @returns {Promise<string>}
|
||||
*/
|
||||
async #serializeDict(dict) {
|
||||
const buffer = [];
|
||||
await writeValue(dict, buffer, /* transform = */ null);
|
||||
return buffer.join("");
|
||||
}
|
||||
|
||||
/**
|
||||
* Cheap bucket key for a resource stream: the serialized dict, the byte
|
||||
* length, and a few sampled chunks (so large payloads aren't fully hashed).
|
||||
* Collisions only group candidates that are then compared byte-for-byte, so
|
||||
* they cost time but never cause a wrong merge.
|
||||
* @param {string} dictStr
|
||||
* @param {Uint8Array} bytes
|
||||
* @returns {string}
|
||||
*/
|
||||
#resourceStreamKey(dictStr, bytes) {
|
||||
const SAMPLE_SIZE = 256;
|
||||
const SAMPLE_COUNT = 4;
|
||||
const { length } = bytes;
|
||||
const hash = new MurmurHash3_64();
|
||||
hash.update(dictStr);
|
||||
hash.update(`#${length}`);
|
||||
if (length <= SAMPLE_SIZE * SAMPLE_COUNT) {
|
||||
hash.update(bytes);
|
||||
} else {
|
||||
const step = Math.floor((length - SAMPLE_SIZE) / (SAMPLE_COUNT - 1));
|
||||
for (let i = 0; i < SAMPLE_COUNT; i++) {
|
||||
const start = Math.min(i * step, length - SAMPLE_SIZE);
|
||||
hash.update(bytes.subarray(start, start + SAMPLE_SIZE));
|
||||
}
|
||||
}
|
||||
return hash.hexdigest();
|
||||
}
|
||||
|
||||
/**
|
||||
* Clone a resource stream and return its output reference, reusing an earlier
|
||||
* copy when possible. The reference is allocated lazily (in
|
||||
* #dedupResourceStream), so a reused resource leaves no unused reference.
|
||||
* @param {Ref} oldRef
|
||||
* @param {BaseStream} stream
|
||||
* @param {XRef} xref
|
||||
* @param {RefSet} resourceStreamPath
|
||||
* @returns {Promise<Ref>}
|
||||
*/
|
||||
async #collectResourceStream(oldRef, stream, xref, resourceStreamPath) {
|
||||
const {
|
||||
currentDocument: { oldRefMapping, resourceStreamPromises },
|
||||
} = this;
|
||||
|
||||
// Re-entry means a (malformed) cycle back to this stream: allocate its
|
||||
// reference now to break the loop, like the generic path's eager alloc.
|
||||
if (resourceStreamPath.has(oldRef)) {
|
||||
let ref = oldRefMapping.get(oldRef);
|
||||
if (!ref) {
|
||||
ref = this.newRef;
|
||||
oldRefMapping.put(oldRef, ref);
|
||||
}
|
||||
return ref;
|
||||
}
|
||||
|
||||
const key = oldRef.toString();
|
||||
const pending = resourceStreamPromises.get(key);
|
||||
if (pending) {
|
||||
return pending;
|
||||
}
|
||||
|
||||
// The path only grows here, so the shared parent path can be passed
|
||||
// read-only everywhere else; snapshot it, add this stream, and recurse.
|
||||
const childPath = new RefSet(resourceStreamPath);
|
||||
childPath.put(oldRef);
|
||||
|
||||
const promise = Promise.resolve().then(async () => {
|
||||
const collected = await this.#collectDependencies(
|
||||
stream,
|
||||
true,
|
||||
xref,
|
||||
childPath
|
||||
);
|
||||
|
||||
// A cycle already allocated a reference, so store the clone there.
|
||||
const cycleRef = oldRefMapping.get(oldRef);
|
||||
if (cycleRef) {
|
||||
this.xref[cycleRef.num] = collected;
|
||||
return cycleRef;
|
||||
}
|
||||
|
||||
const ref = await this.#dedupResourceStream(collected);
|
||||
oldRefMapping.put(oldRef, ref);
|
||||
return ref;
|
||||
});
|
||||
resourceStreamPromises.set(key, promise);
|
||||
try {
|
||||
return await promise;
|
||||
} finally {
|
||||
if (resourceStreamPromises.get(key) === promise) {
|
||||
resourceStreamPromises.delete(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the reference for a cloned resource stream, reusing a byte-identical
|
||||
* earlier copy or else allocating and registering a new one.
|
||||
* @param {BaseStream} stream
|
||||
* @returns {Promise<Ref>}
|
||||
*/
|
||||
async #dedupResourceStream(stream) {
|
||||
const dictStr = await this.#serializeDict(stream.dict);
|
||||
const bytes = this.#rawStreamBytes(stream);
|
||||
const key = this.#resourceStreamKey(dictStr, bytes);
|
||||
|
||||
let bucket = this.#resourceStreamCache.get(key);
|
||||
if (bucket) {
|
||||
// Same key only means "maybe equal": confirm with an exact comparison.
|
||||
for (const entry of bucket) {
|
||||
if (
|
||||
entry.dictStr === dictStr &&
|
||||
isArrayEqual(this.#rawStreamBytes(entry.stream), bytes)
|
||||
) {
|
||||
return entry.ref;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bucket = [];
|
||||
this.#resourceStreamCache.set(key, bucket);
|
||||
}
|
||||
const ref = this.newRef;
|
||||
this.xref[ref.num] = stream;
|
||||
bucket.push({ ref, dictStr, stream });
|
||||
return ref;
|
||||
}
|
||||
|
||||
async #cloneStructTreeNode(
|
||||
parentStructRef,
|
||||
node,
|
||||
|
||||
@ -75,14 +75,17 @@ async function writeStream(stream, buffer, transform) {
|
||||
isName(filterZero, "JBIG2Decode") ||
|
||||
isName(filterZero, "CCITTFaxDecode") ||
|
||||
isName(filterZero, "LZWDecode");
|
||||
const isFilterZeroCompressedObject =
|
||||
isFilterZeroFlateDecode ||
|
||||
isFilterZeroImageDecode ||
|
||||
isName(filterZero, "BrotliDecode");
|
||||
|
||||
// If the string is too small there is no real benefit in compressing it.
|
||||
// The number 256 is arbitrary, but it should be reasonable.
|
||||
const MIN_LENGTH_FOR_COMPRESSING = 256;
|
||||
|
||||
if (
|
||||
!isFilterZeroFlateDecode &&
|
||||
!isFilterZeroImageDecode &&
|
||||
!isFilterZeroCompressedObject &&
|
||||
bytes.length >= MIN_LENGTH_FOR_COMPRESSING
|
||||
) {
|
||||
try {
|
||||
|
||||
@ -27,6 +27,7 @@ import {
|
||||
PasswordResponses,
|
||||
PermissionFlag,
|
||||
ResponseException,
|
||||
stringToBytes,
|
||||
UnknownErrorException,
|
||||
} from "../../src/shared/util.js";
|
||||
import {
|
||||
@ -88,6 +89,57 @@ describe("api", function () {
|
||||
.join("");
|
||||
}
|
||||
|
||||
/**
 * Count how many times `marker` occurs in `bytes`, comparing each byte with
 * the marker's UTF-16 code units. Overlapping occurrences are all counted.
 * @param {Uint8Array} bytes - The data to scan.
 * @param {string} marker - The text to look for.
 * @returns {number} The number of occurrences; 0 for an empty marker.
 */
function countMarker(bytes, marker) {
  const markerLength = marker.length;
  // An empty marker would otherwise "match" at every position.
  if (markerLength === 0) {
    return 0;
  }
  let count = 0;
  for (let i = 0, ii = bytes.length - markerLength; i <= ii; i++) {
    let j = 0;
    while (j < markerLength && bytes[i + j] === marker.charCodeAt(j)) {
      j++;
    }
    if (j === markerLength) {
      count++;
    }
  }
  return count;
}
|
||||
|
||||
/**
 * Build a minimal one-page PDF whose resources expose the same image stream
 * (object 4) under two XObject names, /Im0 and /Im1.
 * @returns {Uint8Array} The PDF file, converted with `stringToBytes`.
 */
function buildSharedImageResourcePdf() {
  // Serialize one indirect stream object; /Length is the payload length.
  const streamObject = (num, dict, data) =>
    `${num} 0 obj\n<< ${dict} /Length ${data.length} >>\n` +
    `stream\n${data}\nendstream\nendobj\n`;
  const objects = [
    "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
    "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
    "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 10 10] " +
      "/Resources << /XObject << /Im0 4 0 R /Im1 4 0 R >> >> " +
      "/Contents 5 0 R >>\nendobj\n",
    streamObject(
      4,
      "/Type /XObject /Subtype /Image /Width 1 /Height 1 " +
        "/ColorSpace /DeviceRGB /BitsPerComponent 8 /Filter /ASCIIHexDecode",
      "FF0000>"
    ),
    streamObject(5, "", "q 10 0 0 10 0 0 cm /Im0 Do Q"),
  ];

  // The file only contains ASCII characters, so the string length at each
  // step is also the byte offset needed by the xref table.
  let pdf = "%PDF-1.7\n";
  const offsets = [];
  for (const obj of objects) {
    offsets.push(pdf.length);
    pdf += obj;
  }
  const xrefOffset = pdf.length;
  // Entry 0 is the free-list head, followed by one entry per object.
  pdf += `xref\n0 ${objects.length + 1}\n`;
  pdf += "0000000000 65535 f \n";
  for (const offset of offsets) {
    pdf += `${offset.toString().padStart(10, "0")} 00000 n \n`;
  }
  pdf +=
    `trailer\n<< /Size ${objects.length + 1} /Root 1 0 R >>\n` +
    `startxref\n${xrefOffset}\n%%EOF\n`;
  return stringToBytes(pdf);
}
|
||||
|
||||
function getNamedNodeInXML(node, path) {
|
||||
for (const component of path.split(".")) {
|
||||
if (!node.childNodes) {
|
||||
@ -5873,6 +5925,71 @@ small scripts as well as for`);
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("deduplicates resource streams shared across merged copies", async function () {
  // Each occurrence of this name in the output is counted as one embedded
  // font program.
  const MARKER = "/CIDFontType0C";

  const loadingTask = getDocument(
    buildGetDocumentParams("doc_1_3_pages.pdf")
  );
  const pdfDoc = await loadingTask.promise;
  // The same buffer loaded several times yields distinct worker-side
  // documents, so this is a true cross-document merge of identical data.
  const pdfData = await DefaultFileReaderFactory.fetch({
    path: TEST_PDFS_PATH + "doc_2_3_pages.pdf",
  });

  // Baseline: the font programs in one duplicate-free copy.
  const dataSingle = await pdfDoc.extractPages([{ document: pdfData }]);
  const fontsSingle = countMarker(dataSingle, MARKER);
  expect(fontsSingle).toBeGreaterThan(0);

  // Four identical copies must share them (a naive merge would write 4x).
  const COPIES = 4;
  const dataMany = await pdfDoc.extractPages(
    Array.from({ length: COPIES }, () => ({ document: pdfData }))
  );
  expect(countMarker(dataMany, MARKER)).toEqual(fontsSingle);

  // The merged document must still be valid and render every page.
  const newLoadingTask = getDocument({ data: dataMany });
  const newPdfDoc = await newLoadingTask.promise;
  expect(newPdfDoc.numPages).toEqual(3 * COPIES);
  for (let i = 1; i <= 3 * COPIES; i++) {
    const pdfPage = await newPdfDoc.getPage(i);
    const { items: textItems } = await pdfPage.getTextContent();
    // The three source pages repeat once per merged copy.
    expect(mergeText(textItems)).toEqual(
      `Document 2:Page ${((i - 1) % 3) + 1}`
    );
  }
  await newLoadingTask.destroy();

  await loadingTask.destroy();
});
|
||||
|
||||
it("deduplicates resource streams reached concurrently", async function () {
  const pdfData = buildSharedImageResourcePdf();
  // Every consumer gets its own copy of the generated bytes.
  const loadingTask = getDocument({ data: pdfData.slice() });
  const pdfDoc = await loadingTask.promise;

  // The source page has two XObject names pointing at the same image
  // stream. Those references are cloned concurrently from the resource
  // dictionary, and must not be mistaken for a reference cycle.
  const data = await pdfDoc.extractPages([
    { document: pdfData.slice() },
    { document: pdfData.slice() },
  ]);
  // Two names on each of two merged pages must still yield a single image.
  expect(countMarker(data, "/Subtype /Image")).toEqual(1);

  // The merged output must load and expose both pages.
  const newLoadingTask = getDocument({ data });
  const newPdfDoc = await newLoadingTask.promise;
  expect(newPdfDoc.numPages).toEqual(2);
  await newPdfDoc.getPage(1);
  await newPdfDoc.getPage(2);
  await newLoadingTask.destroy();

  await loadingTask.destroy();
});
|
||||
|
||||
it("should merge two PDFs with page included ranges", async function () {
|
||||
const loadingTask = getDocument(
|
||||
buildGetDocumentParams("tracemonkey.pdf")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user