mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-02-08 00:21:11 +01:00
For now it's only possible to create a single PDF by selecting some pages from different PDF sources. The merge is still pretty basic (which is why this is still a WIP): none of these data are merged for now: - the struct trees - the page labels - the outlines - named destinations. Furthermore, there are 2 new ref tests where some new PDFs are created: one with some extracted pages and another one (encrypted) which is just rewritten. The ref images are generated from the original PDFs by selecting the pages we want, and the new images are taken from the generated PDFs.
498 lines
14 KiB
JavaScript
498 lines
14 KiB
JavaScript
/* Copyright 2020 Mozilla Foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
import { bytesToString, info, warn } from "../shared/util.js";
|
|
import { Dict, isName, Name, Ref } from "./primitives.js";
|
|
import {
|
|
escapePDFName,
|
|
escapeString,
|
|
getSizeInBytes,
|
|
parseXFAPath,
|
|
} from "./core_utils.js";
|
|
import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js";
|
|
import { Stream, StringStream } from "./stream.js";
|
|
import { BaseStream } from "./base_stream.js";
|
|
import { calculateMD5 } from "./calculate_md5.js";
|
|
|
|
/**
 * Serialize one indirect object (`num gen obj ... endobj`) into `buffer`.
 *
 * @param {Ref} ref - The object's reference (number/generation).
 * @param {*} obj - The value to serialize.
 * @param {Array<string>} buffer - Output chunks, appended in order.
 * @param {Object} options - `encrypt` (CipherTransformFactory or null) and
 *   `encryptRef` (the /Encrypt dictionary's own ref, which must never be
 *   encrypted with itself).
 */
async function writeObject(
  ref,
  obj,
  buffer,
  { encrypt = null, encryptRef = null }
) {
  // Never encrypt the encrypt dictionary itself.
  let transform = null;
  if (encrypt && encryptRef !== ref) {
    transform = encrypt.createCipherTransform(ref.num, ref.gen);
  }
  buffer.push(`${ref.num} ${ref.gen} obj\n`);
  await writeValue(obj, buffer, transform);
  buffer.push("\nendobj\n");
}
|
|
|
|
/**
 * Serialize a dictionary as `<< /Key value ... >>` into `buffer`.
 * Keys are escaped as PDF names; values go through `writeValue` so that
 * nested structures and encryption are handled uniformly.
 */
async function writeDict(dict, buffer, transform) {
  buffer.push("<<");
  for (const [key, rawValue] of dict.getRawEntries()) {
    buffer.push(` /${escapePDFName(key)} `);
    await writeValue(rawValue, buffer, transform);
  }
  buffer.push(">>");
}
|
|
|
|
// Serialize a stream object: (optionally) deflate its raw bytes, update the
// /Filter, /DecodeParms and /Length entries, then emit
// `<<dict>> stream\n...\nendstream` into `buffer`.
async function writeStream(stream, buffer, transform) {
  // Work on the raw (still-encoded) bytes, not the decoded view.
  stream = stream.getOriginalStream();
  stream.reset();
  let bytes = stream.getBytes();
  const { dict } = stream;

  const [filter, params] = await Promise.all([
    dict.getAsync("Filter"),
    dict.getAsync("DecodeParms"),
  ]);

  // /Filter may be a single name or an array of names (possibly refs).
  const filterZero = Array.isArray(filter)
    ? await dict.xref.fetchIfRefAsync(filter[0])
    : filter;
  const isFilterZeroFlateDecode = isName(filterZero, "FlateDecode");

  // If the string is too small there is no real benefit in compressing it.
  // The number 256 is arbitrary, but it should be reasonable.
  const MIN_LENGTH_FOR_COMPRESSING = 256;

  // Skip compression when the data is already Flate-encoded (first filter).
  if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING && !isFilterZeroFlateDecode) {
    try {
      const cs = new CompressionStream("deflate");
      const writer = cs.writable.getWriter();
      await writer.ready;
      // NOTE: the write/close promise is deliberately not awaited here —
      // it can only settle while the readable side below is being drained,
      // so awaiting it first would deadlock. Errors are ignored; the outer
      // catch handles compression failures.
      writer
        .write(bytes)
        .then(async () => {
          await writer.ready;
          await writer.close();
        })
        .catch(() => {});

      // Response::text doesn't return the correct data.
      const buf = await new Response(cs.readable).arrayBuffer();
      bytes = new Uint8Array(buf);

      // Prepend FlateDecode to the filter chain (with a matching null in
      // /DecodeParms so the two arrays stay aligned).
      let newFilter, newParams;
      if (!filter) {
        newFilter = Name.get("FlateDecode");
      } else if (!isFilterZeroFlateDecode) {
        newFilter = Array.isArray(filter)
          ? [Name.get("FlateDecode"), ...filter]
          : [Name.get("FlateDecode"), filter];
        if (params) {
          newParams = Array.isArray(params)
            ? [null, ...params]
            : [null, params];
        }
      }
      if (newFilter) {
        dict.set("Filter", newFilter);
      }
      if (newParams) {
        dict.set("DecodeParms", newParams);
      }
    } catch (ex) {
      // Best-effort: fall back to writing the uncompressed bytes.
      info(`writeStream - cannot compress data: "${ex}".`);
    }
  }

  // One byte per character (latin1); encryption happens on the string form.
  let string = bytesToString(bytes);
  if (transform) {
    string = transform.encryptString(string);
  }

  dict.set("Length", string.length);
  await writeDict(dict, buffer, transform);
  buffer.push(" stream\n", string, "\nendstream");
}
|
|
|
|
/**
 * Serialize an array (or typed array) as `[v0 v1 ...]` into `buffer`,
 * with a single space between consecutive elements.
 */
async function writeArray(array, buffer, transform) {
  buffer.push("[");
  let first = true;
  for (const element of array) {
    if (!first) {
      buffer.push(" ");
    }
    first = false;
    await writeValue(element, buffer, transform);
  }
  buffer.push("]");
}
|
|
|
|
/**
 * Serialize any supported PDF value into `buffer`, dispatching on its type:
 * names, refs, arrays, strings (encrypted when a transform is given),
 * numbers, booleans, dictionaries, streams and null.
 */
async function writeValue(value, buffer, transform) {
  if (value instanceof Name) {
    buffer.push(`/${escapePDFName(value.name)}`);
    return;
  }
  if (value instanceof Ref) {
    buffer.push(`${value.num} ${value.gen} R`);
    return;
  }
  if (Array.isArray(value) || ArrayBuffer.isView(value)) {
    await writeArray(value, buffer, transform);
    return;
  }
  if (typeof value === "string") {
    const str = transform ? transform.encryptString(value) : value;
    buffer.push(`(${escapeString(str)})`);
    return;
  }
  if (typeof value === "number") {
    // Don't try to round numbers in general, it could lead to have degenerate
    // matrices (e.g. [0.000008 0 0 0.000008 0 0]).
    // The numbers must be "rounded" only when pdf.js is producing them and the
    // current transformation matrix is well known.
    buffer.push(value.toString());
    return;
  }
  if (typeof value === "boolean") {
    buffer.push(value.toString());
    return;
  }
  if (value instanceof Dict) {
    await writeDict(value, buffer, transform);
    return;
  }
  if (value instanceof BaseStream) {
    await writeStream(value, buffer, transform);
    return;
  }
  if (value === null) {
    buffer.push("null");
    return;
  }
  warn(`Unhandled value in writer: ${typeof value}, please file a bug.`);
}
|
|
|
|
/**
 * Write `number` big-endian into `buffer[offset .. offset+size-1]`.
 *
 * Uses arithmetic (`% 256`, `Math.floor(/ 256)`) rather than bitwise ops:
 * `number >>= 8` coerces to a signed 32-bit integer, so byte offsets of
 * 2^31 or more (PDFs larger than 2 GiB, where getSizeInBytes can request
 * 5-byte fields) would be sign-extended and written with corrupted high
 * bytes. For values below 2^31 the result is identical to the bitwise form.
 *
 * @param {number} number - Non-negative safe integer to encode.
 * @param {number} size - Field width in bytes.
 * @param {number} offset - Start position in `buffer`.
 * @param {Uint8Array} buffer - Destination buffer.
 * @returns {number} The offset just past the written field.
 */
function writeInt(number, size, offset, buffer) {
  for (let i = size + offset - 1; i > offset - 1; i--) {
    buffer[i] = number % 256;
    number = Math.floor(number / 256);
  }
  return offset + size;
}
|
|
|
|
/**
 * Copy a (latin1) string into `buffer` starting at `offset`, one byte per
 * UTF-16 code unit (high bits are dropped).
 *
 * @returns {number} The position just past the last written byte.
 */
function writeString(string, offset, buffer) {
  const length = string.length;
  let pos = offset;
  for (let i = 0; i < length; i++, pos++) {
    buffer[pos] = string.charCodeAt(i) & 0xff;
  }
  return pos;
}
|
|
|
|
/**
 * Compute the MD5 digest used for the trailer /ID entry, hashing the
 * current time, the filename, the file size and all /Info values
 * (see 14.4 - File Identifiers).
 *
 * @returns {string} The 16-byte digest as a latin1 string.
 */
function computeMD5(filesize, xrefInfo) {
  const nowSeconds = Math.floor(Date.now() / 1000);
  const parts = [
    nowSeconds.toString(),
    xrefInfo.filename || "",
    filesize.toString(),
    ...xrefInfo.infoMap.values(),
  ];
  const totalLength = Math.sumPrecise(parts.map(str => str.length));

  const array = new Uint8Array(totalLength);
  let pos = 0;
  for (const part of parts) {
    pos = writeString(part, pos, array);
  }
  return bytesToString(calculateMD5(array, 0, array.length));
}
|
|
|
|
/**
 * Apply saved XFA field changes to the datasets XML and return the updated
 * XML as a string.
 *
 * @param {string} str - The original datasets XML.
 * @param {Iterable} changes - Entries whose `xfa` member (if any) holds a
 *   `path` into the XML tree and the new `value`.
 * @returns {string} The serialized, updated XML document.
 */
function writeXFADataForAcroform(str, changes) {
  const xml = new SimpleXMLParser({ hasAttributes: true }).parseFromString(str);

  for (const { xfa } of changes) {
    if (!xfa) {
      continue;
    }
    const { path, value } = xfa;
    if (!path) {
      continue;
    }
    const nodePath = parseXFAPath(path);
    let node = xml.documentElement.searchNode(nodePath, 0);
    if (!node && nodePath.length > 1) {
      // If we're lucky the last element in the path will identify the node.
      node = xml.documentElement.searchNode([nodePath.at(-1)], 0);
    }
    if (!node) {
      warn(`Node not found for path: ${path}`);
      continue;
    }
    // Replace the node's children with the new value(s).
    node.childNodes = Array.isArray(value)
      ? value.map(val => new SimpleDOMNode("value", val))
      : [new SimpleDOMNode("#text", value)];
  }

  const pieces = [];
  xml.documentElement.dump(pieces);
  return pieces.join("");
}
|
|
|
|
/**
 * Record an updated AcroForm dictionary in `changes` when saving requires
 * it: either /NeedAppearances must be set, or the /XFA array is missing its
 * datasets entry and one must be inserted.
 */
async function updateAcroform({
  xref,
  acroForm,
  acroFormRef,
  hasXfa,
  hasXfaDatasetsEntry,
  xfaDatasetsRef,
  needAppearances,
  changes,
}) {
  if (hasXfa && !hasXfaDatasetsEntry && !xfaDatasetsRef) {
    warn("XFA - Cannot save it");
  }

  // Nothing to do unless we must flag appearances or add a datasets entry.
  const mustAddDatasets = hasXfa && xfaDatasetsRef && !hasXfaDatasetsEntry;
  if (!needAppearances && !mustAddDatasets) {
    return;
  }

  const dict = acroForm.clone();

  if (hasXfa && !hasXfaDatasetsEntry) {
    // The XFA array lacks a datasets entry, so insert one that points at
    // the (new) datasets stream.
    const newXfa = acroForm.get("XFA").slice();
    newXfa.splice(2, 0, "datasets");
    newXfa.splice(3, 0, xfaDatasetsRef);
    dict.set("XFA", newXfa);
  }

  if (needAppearances) {
    dict.set("NeedAppearances", true);
  }

  changes.put(acroFormRef, {
    data: dict,
  });
}
|
|
|
|
/**
 * Record the updated XFA datasets stream in `changes`. When no serialized
 * data is supplied, it is regenerated from the existing datasets entry.
 */
function updateXFA({ xfaData, xfaDatasetsRef, changes, xref }) {
  if (xfaData === null) {
    const datasets = xref.fetchIfRef(xfaDatasetsRef);
    xfaData = writeXFADataForAcroform(datasets.getString(), changes);
  }

  const dataStream = new StringStream(xfaData);
  dataStream.dict = new Dict(xref);
  dataStream.dict.setIfName("Type", "EmbeddedFile");

  changes.put(xfaDatasetsRef, {
    data: dataStream,
  });
}
|
|
|
|
/**
 * Append a classic cross-reference table plus trailer (`xref ... trailer
 * ... startxref ... %%EOF`) to `buffer` for the changed objects.
 */
async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) {
  buffer.push("xref\n");
  const indexes = getIndexes(newRefs);
  let pos = 0;
  for (const { ref, data } of newRefs) {
    // Emit a subsection header ("first count") whenever this ref starts a
    // new run of consecutive object numbers.
    if (ref.num === indexes[pos]) {
      buffer.push(`${indexes[pos]} ${indexes[pos + 1]}\n`);
      pos += 2;
    }
    // The EOL is \r\n to make sure that every entry is exactly 20 bytes long.
    // (see 7.5.4 - Cross-Reference Table).
    if (data !== null) {
      const offsetStr = baseOffset.toString().padStart(10, "0");
      const genStr = Math.min(ref.gen, 0xffff).toString().padStart(5, "0");
      buffer.push(`${offsetStr} ${genStr} n\r\n`);
      baseOffset += data.length;
    } else {
      // Deleted object: free entry with an incremented generation number.
      const genStr = Math.min(ref.gen + 1, 0xffff)
        .toString()
        .padStart(5, "0");
      buffer.push(`0000000000 ${genStr} f\r\n`);
    }
  }
  computeIDs(baseOffset, xrefInfo, newXref);
  buffer.push("trailer\n");
  await writeDict(newXref, buffer, null);
  buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n");
}
|
|
|
|
/**
 * Build the flat `[first, count, first, count, ...]` list describing runs
 * of consecutive object numbers in `newRefs` (which must be sorted by
 * `ref.num`), as used by xref subsections and the /Index entry.
 */
function getIndexes(newRefs) {
  const indexes = [];
  for (const { ref } of newRefs) {
    const runStart = indexes.at(-2);
    const runLength = indexes.at(-1);
    if (runStart !== undefined && ref.num === runStart + runLength) {
      // Extends the current run.
      indexes[indexes.length - 1] = runLength + 1;
    } else {
      // Starts a new run.
      indexes.push(ref.num, 1);
    }
  }
  return indexes;
}
|
|
|
|
// Append a cross-reference *stream* (plus `startxref ... %%EOF`) to `buffer`
// for the changed objects; used instead of a classic xref table when the
// document already uses xref streams. Entry field widths (/W) are sized to
// the largest offset and generation encountered.
async function getXRefStreamTable(
  xrefInfo,
  baseOffset,
  newRefs,
  newXref,
  buffer
) {
  const xrefTableData = [];
  let maxOffset = 0;
  let maxGen = 0;
  for (const { ref, data, objStreamRef, index } of newRefs) {
    let gen;
    maxOffset = Math.max(maxOffset, baseOffset);
    // The first number in each entry is the type (see 7.5.8.3):
    // 0: free object
    // 1: in-use object
    // 2: compressed object
    if (objStreamRef) {
      // Compressed: second field is the containing object stream's number,
      // third is the index within that stream.
      gen = index;
      xrefTableData.push([2, objStreamRef.num, gen]);
    } else if (data !== null) {
      // In-use: second field is the byte offset of the object.
      gen = Math.min(ref.gen, 0xffff);
      xrefTableData.push([1, baseOffset, gen]);
      baseOffset += data.length;
    } else {
      // Free: generation is incremented for the next reuse.
      gen = Math.min(ref.gen + 1, 0xffff);
      xrefTableData.push([0, 0, gen]);
    }
    maxGen = Math.max(maxGen, gen);
  }
  newXref.set("Index", getIndexes(newRefs));
  // Field widths: 1 byte for the type, then just enough bytes for the
  // largest offset and generation values.
  const offsetSize = getSizeInBytes(maxOffset);
  const maxGenSize = getSizeInBytes(maxGen);
  const sizes = [1, offsetSize, maxGenSize];
  newXref.set("W", sizes);
  computeIDs(baseOffset, xrefInfo, newXref);

  // Pack the entries big-endian into the stream's byte buffer.
  const structSize = Math.sumPrecise(sizes);
  const data = new Uint8Array(structSize * xrefTableData.length);
  const stream = new Stream(data);
  stream.dict = newXref;

  let offset = 0;
  for (const [type, objOffset, gen] of xrefTableData) {
    offset = writeInt(type, sizes[0], offset, data);
    offset = writeInt(objOffset, sizes[1], offset, data);
    offset = writeInt(gen, sizes[2], offset, data);
  }

  // The xref stream is itself an (unencrypted) indirect object.
  await writeObject(xrefInfo.newRef, stream, buffer, {});
  buffer.push("startxref\n", baseOffset.toString(), "\n%%EOF\n");
}
|
|
|
|
/**
 * Update the trailer's /ID entry: keep the original first ID when present
 * and append a freshly computed MD5 as the second one. Documents without an
 * existing /ID are left unchanged.
 */
function computeIDs(baseOffset, xrefInfo, newXref) {
  const { fileIds } = xrefInfo;
  if (!Array.isArray(fileIds) || fileIds.length === 0) {
    return;
  }
  const md5 = computeMD5(baseOffset, xrefInfo);
  newXref.set("ID", [fileIds[0] || md5, md5]);
}
|
|
|
|
/**
 * Build the trailer dictionary (or xref-stream dictionary) for the
 * incremental update: /Prev, /Size, /Root, /Info, /Encrypt and - for xref
 * streams - /Type. When an xref stream is used, its ref is also reserved in
 * `changes` so the stream gets written as an object.
 */
function getTrailerDict(xrefInfo, changes, useXrefStream) {
  const trailer = new Dict(null);
  trailer.setIfDefined("Prev", xrefInfo?.startXRef);
  const newRef = xrefInfo.newRef;
  if (useXrefStream) {
    changes.put(newRef, { data: "" });
    trailer.set("Size", newRef.num + 1);
    trailer.setIfName("Type", "XRef");
  } else {
    trailer.set("Size", newRef.num);
  }
  trailer.setIfDefined("Root", xrefInfo?.rootRef);
  trailer.setIfDefined("Info", xrefInfo?.infoRef);
  trailer.setIfDefined("Encrypt", xrefInfo?.encryptRef);

  return trailer;
}
|
|
|
|
/**
 * Serialize every changed object and return `{ ref, data, ... }` records
 * sorted by object number. Objects living in an object stream, already
 * serialized strings and deleted entries (data === null) pass through
 * unchanged; everything else is serialized via `writeObject`.
 */
async function writeChanges(changes, xref, buffer = []) {
  const newRefs = [];
  for (const [ref, { data, objStreamRef, index }] of changes.items()) {
    if (objStreamRef) {
      // Stored inside an object stream; only its location is recorded.
      newRefs.push({ ref, data, objStreamRef, index });
    } else if (data === null || typeof data === "string") {
      // Deleted, or already serialized upstream.
      newRefs.push({ ref, data });
    } else {
      await writeObject(ref, data, buffer, xref);
      newRefs.push({ ref, data: buffer.join("") });
      buffer.length = 0;
    }
  }
  // The xref tables require entries ordered by object number.
  newRefs.sort((a, b) => a.ref.num - b.ref.num);
  return newRefs;
}
|
|
|
|
/**
 * Produce an incrementally-updated PDF: the original bytes followed by the
 * serialized changed objects and a new cross-reference section (classic
 * table or xref stream).
 *
 * @returns {Uint8Array} The complete updated document.
 */
async function incrementalUpdate({
  originalData,
  xrefInfo,
  changes,
  xref = null,
  hasXfa = false,
  xfaDatasetsRef = null,
  hasXfaDatasetsEntry = false,
  needAppearances,
  acroFormRef = null,
  acroForm = null,
  xfaData = null,
  useXrefStream = false,
}) {
  await updateAcroform({
    xref,
    acroForm,
    acroFormRef,
    hasXfa,
    hasXfaDatasetsEntry,
    xfaDatasetsRef,
    needAppearances,
    changes,
  });

  if (hasXfa) {
    updateXFA({
      xfaData,
      xfaDatasetsRef,
      changes,
      xref,
    });
  }

  const trailerDict = getTrailerDict(xrefInfo, changes, useXrefStream);
  const chunks = [];
  const newRefs = await writeChanges(changes, xref, chunks);

  let writeOffset = originalData.length;
  const lastByte = originalData.at(-1);
  if (lastByte !== /* \n */ 0x0a && lastByte !== /* \r */ 0x0d) {
    // Avoid to concatenate %%EOF with an object definition
    chunks.push("\n");
    writeOffset += 1;
  }

  // Serialized objects come first, then the xref section below.
  for (const { data } of newRefs) {
    if (data !== null) {
      chunks.push(data);
    }
  }

  await (useXrefStream
    ? getXRefStreamTable(xrefInfo, writeOffset, newRefs, trailerDict, chunks)
    : getXRefTable(xrefInfo, writeOffset, newRefs, trailerDict, chunks));

  const totalLength =
    originalData.length + Math.sumPrecise(chunks.map(str => str.length));
  const result = new Uint8Array(totalLength);

  // Original data
  result.set(originalData);
  let pos = originalData.length;

  // New data
  for (const chunk of chunks) {
    pos = writeString(chunk, pos, result);
  }

  return result;
}
|
|
|
|
export { incrementalUpdate, writeChanges, writeDict, writeObject, writeValue };
|