Mostly working
Some checks are pending
CI / Test (20) (push) Waiting to run
CI / Test (22) (push) Waiting to run
CI / Test (23) (push) Waiting to run
CodeQL / Analyze (javascript) (push) Waiting to run
Lint / Lint (lts/*) (push) Waiting to run
Types tests / Test (lts/*) (push) Waiting to run

* ranges passed through
* updates possible
This commit is contained in:
Kilian Schüttler 2025-03-26 17:24:25 +01:00
parent 1dc874260b
commit 20b6e32907
8 changed files with 256 additions and 103 deletions

View File

@ -11,7 +11,7 @@ const STANDARD_FONT_DATA_URL =
// Loading file from file system into typed array.
const pdfPath =
process.argv[2] || "../../web/compressed.tracemonkey-pldi-09.pdf";
process.argv[2] || "C:\\Users\\kj131\\pdf-forge\\test_pdfs\\ISO_32000-2_2020(en).pdf";
const data = new Uint8Array(fs.readFileSync(pdfPath));
// Load the PDF file.
@ -21,12 +21,46 @@ const loadingTask = getDocument({
cMapPacked: CMAP_PACKED,
standardFontDataUrl: STANDARD_FONT_DATA_URL,
});
try {
const pdfDocument = await loadingTask.promise;
console.log("# PDF document loaded.");
const page = await pdfDocument.getPage(1);
const opList = await page.getOperatorList();
console.log(opList);
} catch (e) {
console.error(e);
test(loadingTask);
/**
 * Demo driver: loads page 4 of the document, dumps its operators, rebuilds
 * the content stream from (at most) the first five operator ranges, pushes
 * that back with `page.updateContents()` and dumps the operators again.
 *
 * Errors from any step are logged with `console.error` and not rethrown.
 *
 * @param {{promise: Promise<Object>}} loading - Loading task whose `promise`
 *   resolves to the document proxy.
 * @returns {Promise<void>}
 */
async function test(loading) {
  try {
    const pdfDocument = await loading.promise;
    console.log("# PDF document loaded.");
    const page = await pdfDocument.getPage(4);
    // Await the initial dump: otherwise its rejection would bypass this
    // try/catch and its requests would overlap with the timed calls below.
    await printOpList(page);

    console.time("contents");
    const contents = await page.getContents();
    console.timeEnd("contents");

    console.time("oplist");
    const opList = await page.getOperatorList();
    console.timeEnd("oplist");

    // Re-assemble a content stream from the first five operator ranges;
    // entries without a range are skipped.
    let newContents = "";
    for (let i = 0; i < 5; i++) {
      const range = opList.rangeArray[i];
      if (range) {
        newContents += contents.slice(range[0], range[1]);
        newContents += "\n";
      }
    }
    console.log(newContents);

    await page.updateContents(newContents);
    await printOpList(page);
  } catch (e) {
    console.error(e);
  }
}
/**
 * Logs the raw content-stream text of a page's operators.
 *
 * Fetches the page contents and the operator list, cuts out the
 * `[start, end)` slice for every entry of `rangeArray` that has a range,
 * and prints the first 100 slices with `console.log`.
 *
 * @param {Object} page - Page proxy exposing `getContents()` and
 *   `getOperatorList()`.
 * @returns {Promise<void>}
 */
async function printOpList(page) {
  const rawText = await page.getContents();
  const operatorList = await page.getOperatorList();
  // Entries without a range (null/undefined/holes) are dropped.
  const snippets = operatorList.rangeArray
    .filter(Boolean)
    .map(span => rawText.slice(span[0], span[1]));
  console.log(snippets.slice(0, 100));
}

View File

@ -45,7 +45,7 @@ function bytesToString(bytes) {
}
async function parse(doc) {
const path = "/Page2/Contents/2";
const path = "/Page2/Contents/1";
let [stream] = await getPrimitive(path, doc);
const lexer = new Lexer(stream);
const parser = new Parser({ lexer, xref: doc.xref, trackRanges: true });
@ -63,12 +63,12 @@ async function parse(doc) {
const bytes = stream.getBytes();
const classes = new Set();
for (const o of objs) {
console.log(o[0].constructor.name);
// console.log(o[0].constructor.name);
classes.add(o[0].constructor.name);
const lexemmeBytes = bytes.slice(o[1], o[2]);
console.log(bytesToString(lexemmeBytes));
// console.log(bytesToString(lexemmeBytes));
}
console.log("unique classes", classes);
// console.log("unique classes", classes);
[stream] = await getPrimitive(path, doc);
const preprocessor = new EvaluatorPreprocessor(stream, doc.xref);
const operation = {};
@ -78,10 +78,10 @@ async function parse(doc) {
const fn = operation.fn;
const range = operation.range;
const op = bytesToString(bytes.slice(range[0], range[1]));
console.log(args, fn);
console.log(`${range[0]}------------------------------------`);
console.log(op);
console.log(`${range[1]}------------------------------------`);
// console.log(args, fn);
// console.log(`----------------- ${range} -------------------`);
console.log(`${fn}: ${op}`);
// console.log(`---------------------------------------------`);
}
// console.time("xref");
// let table = await retrieveXref(doc);

View File

@ -65,7 +65,7 @@ import { Catalog } from "./catalog.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { DatasetReader } from "./dataset_reader.js";
import { Linearization } from "./parser.js";
import { NullStream } from "./stream.js";
import { NullStream, StringStream } from "./stream.js";
import { ObjectLoader } from "./object_loader.js";
import { OperatorList } from "./operator_list.js";
import { PartialEvaluator } from "./evaluator.js";
@ -107,6 +107,8 @@ class Page {
this.resourcesPromise = null;
this.xfaFactory = xfaFactory;
this.updatedContents = null;
const idCounters = {
obj: 0,
};
@ -246,10 +248,19 @@ class Page {
throw reason;
}
/**
 * Stores replacement content for this page in `updatedContents`.
 * While that field is not `null`, `getContentStream()` wraps it in a
 * `StringStream` instead of reading the page's own content.
 * No validation is done here.
 * NOTE(review): previously produced results are not invalidated by this
 * method itself - confirm callers take care of that.
 */
setContents(newContents) {
this.updatedContents = newContents;
}
/**
* @returns {Promise<BaseStream>}
*/
getContentStream() {
if (this.updatedContents !== null) {
return new Promise(resolve => {
resolve(new StringStream(this.updatedContents));
});
}
return this.pdfManager.ensure(this, "content").then(content => {
if (content instanceof BaseStream) {
return content;
@ -426,8 +437,11 @@ class Page {
cacheKey,
annotationStorage = null,
modifiedIds = null,
contentOverride = null,
}) {
const contentStreamPromise = this.getContentStream();
const contentStreamPromise = contentOverride
? new StringStream(contentOverride)
: this.getContentStream();
const resourcesPromise = this.loadResources([
"ColorSpace",
"ExtGState",

View File

@ -802,6 +802,7 @@ class PartialEvaluator {
operatorList.addImageOps(
OPS.paintImageXObject,
args,
range,
optionalContent,
hasMask
);
@ -1445,15 +1446,20 @@ class PartialEvaluator {
minMax = [Infinity, Infinity, -Infinity, -Infinity];
break;
}
operatorList.addOp(OPS.constructPath, [[fn], args, minMax], range);
operatorList.addOp(
OPS.constructPath,
[[fn], args, minMax, [range[0], range[1]]],
range
);
if (parsingText) {
operatorList.addOp(OPS.restore, null, range);
operatorList.addOp(OPS.restore, null);
}
} else {
const opArgs = operatorList.argsArray[lastIndex];
opArgs[0].push(fn);
opArgs[1].push(...args);
opArgs[3].push(...range);
const minMax = opArgs[2];
const opRange = operatorList.rangeArray[lastIndex];
@ -5215,7 +5221,7 @@ class EvaluatorPreprocessor {
operation.fn = fn;
operation.args = args;
const end = this.parser.getPosition();
const end = this.parser.getEnd();
operation.range = [start, end];
return true;
}

View File

@ -59,12 +59,7 @@ function getInlineImageCacheKey(bytes) {
}
class Parser {
constructor({
lexer,
xref,
allowStreams = false,
recoveryMode = false,
}) {
constructor({ lexer, xref, allowStreams = false, recoveryMode = false }) {
this.lexer = lexer;
this.xref = xref;
this.allowStreams = allowStreams;
@ -75,8 +70,8 @@ class Parser {
}
refill() {
const [buf1, start1, end1] = this.lexer.getObj(true);
const [buf2, start2, end2] = this.lexer.getObj(true);
const [buf1, start1, end1] = this.lexer.getObjWithRange();
const [buf2, start2, end2] = this.lexer.getObjWithRange();
this.buf1 = buf1;
this.range1 = [start1, end1];
this.buf2 = buf2;
@ -87,12 +82,14 @@ class Parser {
if (this.buf2 instanceof Cmd && this.buf2.cmd === "ID") {
this.buf1 = this.buf2;
this.buf2 = null;
this.lastEnd = this.range1[1];
this.range1 = this.range2;
this.range2 = null;
} else {
this.buf1 = this.buf2;
this.lastEnd = this.range1[1];
this.range1 = this.range2;
const [buf2, start2, end2] = this.lexer.getObj(true);
const [buf2, start2, end2] = this.lexer.getObjWithRange();
this.buf2 = buf2;
this.range2 = [start2, end2];
}
@ -116,6 +113,10 @@ class Parser {
return this.range1 ? this.range1[0] : 0;
}
/**
 * Returns `lastEnd`, the end offset saved from `range1` just before the
 * token buffers were last advanced, or 0 when nothing has been recorded yet.
 * NOTE(review): presumably this is the end of the most recently consumed
 * token - confirm against the buffer-advancing code.
 */
getEnd() {
return this.lastEnd ?? 0;
}
getObjWithRange(cipherTransform = null) {
const start = this.range1[0];
const obj = this.getObj(cipherTransform);
@ -1229,14 +1230,18 @@ class Lexer {
}
getObjWithRange() {
// at the start of getObj() the stream has stepped beyond currentChar by one
const start = this.stream.pos - 1;
const ch = this._skipWhitespaceAndComments();
if (ch === EOF) {
return [ch, -1, -1];
}
// currentChar is always at pos - 1
const start = Math.max(this.stream.pos - 1, 0);
const obj = this.getObj();
const end = this.stream.pos;
const end = this.stream.pos - 1;
return [obj, start, end];
}
getObj(withRange = false) {
_skipWhitespaceAndComments() {
// Skip whitespace and comments.
let comment = false;
let ch = this.currentChar;
@ -1255,16 +1260,14 @@ class Lexer {
}
ch = this.nextChar();
}
const start = this.stream.pos - 1;
const obj = this.extracted(ch);
const end = this.stream.pos - 1;
if (withRange) {
return [start, obj, end];
}
return obj;
return ch;
}
extracted(ch) {
getObj() {
let ch = this._skipWhitespaceAndComments();
if (ch === EOF) {
return ch;
}
// Start reading a token.
switch (ch | 0) {
case 0x30: // '0'

View File

@ -123,7 +123,7 @@ class WorkerMessageHandler {
if (apiVersion !== workerVersion) {
throw new Error(
`The API version "${apiVersion}" does not match ` +
`the Worker version "${workerVersion}".`
`the Worker version "${workerVersion}".`
);
}
@ -141,8 +141,8 @@ class WorkerMessageHandler {
if (enumerableProperties.length) {
throw new Error(
"The `Array.prototype` contains unexpected enumerable properties: " +
enumerableProperties.join(", ") +
"; thus breaking e.g. `for...in` iteration of `Array`s."
enumerableProperties.join(", ") +
"; thus breaking e.g. `for...in` iteration of `Array`s."
);
}
}
@ -206,15 +206,15 @@ class WorkerMessageHandler {
}
async function getPdfManager({
data,
password,
disableAutoFetch,
rangeChunkSize,
length,
docBaseUrl,
enableXfa,
evaluatorOptions,
}) {
data,
password,
disableAutoFetch,
rangeChunkSize,
length,
docBaseUrl,
enableXfa,
evaluatorOptions,
}) {
const pdfManagerArgs = {
source: null,
disableAutoFetch,
@ -242,7 +242,7 @@ class WorkerMessageHandler {
loaded = 0;
fullRequest.headersReady
.then(function () {
.then(function() {
if (!fullRequest.isRangeSupported) {
return;
}
@ -263,13 +263,13 @@ class WorkerMessageHandler {
pdfManagerCapability.resolve(newPdfManager);
cancelXHRs = null;
})
.catch(function (reason) {
.catch(function(reason) {
pdfManagerCapability.reject(reason);
cancelXHRs = null;
});
new Promise(function (resolve, reject) {
const readChunk = function ({ value, done }) {
new Promise(function(resolve, reject) {
const readChunk = function({ value, done }) {
try {
ensureNotTerminated();
if (done) {
@ -314,7 +314,7 @@ class WorkerMessageHandler {
}
};
fullRequest.read().then(readChunk, reject);
}).catch(function (e) {
}).catch(function(e) {
pdfManagerCapability.reject(e);
cancelXHRs = null;
});
@ -341,12 +341,12 @@ class WorkerMessageHandler {
handler
.sendWithPromise("PasswordRequest", ex)
.then(function ({ password }) {
.then(function({ password }) {
finishWorkerTask(task);
pdfManager.updatePassword(password);
pdfManagerReady();
})
.catch(function () {
.catch(function() {
finishWorkerTask(task);
handler.send("DocException", ex);
});
@ -359,7 +359,7 @@ class WorkerMessageHandler {
function pdfManagerReady() {
ensureNotTerminated();
loadDocument(false).then(onSuccess, function (reason) {
loadDocument(false).then(onSuccess, function(reason) {
ensureNotTerminated();
// Try again with recoveryMode == true
@ -367,7 +367,7 @@ class WorkerMessageHandler {
onFailure(reason);
return;
}
pdfManager.requestLoadedStream().then(function () {
pdfManager.requestLoadedStream().then(function() {
ensureNotTerminated();
loadDocument(true).then(onSuccess, onFailure);
@ -378,7 +378,7 @@ class WorkerMessageHandler {
ensureNotTerminated();
getPdfManager(data)
.then(function (newPdfManager) {
.then(function(newPdfManager) {
if (terminated) {
// We were in a process of setting up the manager, but it got
// terminated in the middle.
@ -396,14 +396,14 @@ class WorkerMessageHandler {
.then(pdfManagerReady, onFailure);
}
handler.on("GetPage", function (data) {
return pdfManager.getPage(data.pageIndex).then(function (page) {
handler.on("GetPage", function(data) {
return pdfManager.getPage(data.pageIndex).then(function(page) {
return Promise.all([
pdfManager.ensure(page, "rotate"),
pdfManager.ensure(page, "ref"),
pdfManager.ensure(page, "userUnit"),
pdfManager.ensure(page, "view"),
]).then(function ([rotate, ref, userUnit, view]) {
]).then(function([rotate, ref, userUnit, view]) {
return {
rotate,
ref,
@ -415,104 +415,104 @@ class WorkerMessageHandler {
});
});
handler.on("GetPageIndex", function (data) {
handler.on("GetPageIndex", function(data) {
const pageRef = Ref.get(data.num, data.gen);
return pdfManager.ensureCatalog("getPageIndex", [pageRef]);
});
handler.on("GetDestinations", function (data) {
handler.on("GetDestinations", function(data) {
return pdfManager.ensureCatalog("destinations");
});
handler.on("GetDestination", function (data) {
handler.on("GetDestination", function(data) {
return pdfManager.ensureCatalog("getDestination", [data.id]);
});
handler.on("GetPageLabels", function (data) {
handler.on("GetPageLabels", function(data) {
return pdfManager.ensureCatalog("pageLabels");
});
handler.on("GetPageLayout", function (data) {
handler.on("GetPageLayout", function(data) {
return pdfManager.ensureCatalog("pageLayout");
});
handler.on("GetPageMode", function (data) {
handler.on("GetPageMode", function(data) {
return pdfManager.ensureCatalog("pageMode");
});
handler.on("GetViewerPreferences", function (data) {
handler.on("GetViewerPreferences", function(data) {
return pdfManager.ensureCatalog("viewerPreferences");
});
handler.on("GetOpenAction", function (data) {
handler.on("GetOpenAction", function(data) {
return pdfManager.ensureCatalog("openAction");
});
handler.on("GetAttachments", function (data) {
handler.on("GetAttachments", function(data) {
return pdfManager.ensureCatalog("attachments");
});
handler.on("GetDocJSActions", function (data) {
handler.on("GetDocJSActions", function(data) {
return pdfManager.ensureCatalog("jsActions");
});
handler.on("GetPageJSActions", function ({ pageIndex }) {
return pdfManager.getPage(pageIndex).then(function (page) {
handler.on("GetPageJSActions", function({ pageIndex }) {
return pdfManager.getPage(pageIndex).then(function(page) {
return pdfManager.ensure(page, "jsActions");
});
});
handler.on("GetOutline", function (data) {
handler.on("GetOutline", function(data) {
return pdfManager.ensureCatalog("documentOutline");
});
handler.on("GetOptionalContentConfig", function (data) {
handler.on("GetOptionalContentConfig", function(data) {
return pdfManager.ensureCatalog("optionalContentConfig");
});
handler.on("GetPermissions", function (data) {
handler.on("GetPermissions", function(data) {
return pdfManager.ensureCatalog("permissions");
});
handler.on("GetMetadata", function (data) {
handler.on("GetMetadata", function(data) {
return Promise.all([
pdfManager.ensureDoc("documentInfo"),
pdfManager.ensureCatalog("metadata"),
]);
});
handler.on("GetMarkInfo", function (data) {
handler.on("GetMarkInfo", function(data) {
return pdfManager.ensureCatalog("markInfo");
});
handler.on("GetData", function (data) {
return pdfManager.requestLoadedStream().then(function (stream) {
handler.on("GetData", function(data) {
return pdfManager.requestLoadedStream().then(function(stream) {
return stream.bytes;
});
});
handler.on("GetPrimitiveByPath", function (path_str) {
handler.on("GetPrimitiveByPath", function(path_str) {
return getPrim(path_str, pdfManager.pdfDocument);
});
handler.on("GetXRefEntries", function (data) {
handler.on("GetXRefEntries", function(data) {
return retrieveXref(pdfManager.pdfDocument);
});
handler.on("GetPrimTree", function (request) {
handler.on("GetPrimTree", function(request) {
return getPrimTree(request, pdfManager.pdfDocument);
});
handler.on("GetImageData", function (path) {
handler.on("GetImageData", function(path) {
return getImageAsBlob(path, pdfManager.pdfDocument);
});
handler.on("GetStreamAsString", function (path) {
handler.on("GetStreamAsString", function(path) {
return getStreamAsString(path, pdfManager.pdfDocument);
});
handler.on("GetAnnotations", function ({ pageIndex, intent }) {
return pdfManager.getPage(pageIndex).then(function (page) {
handler.on("GetAnnotations", function({ pageIndex, intent }) {
return pdfManager.getPage(pageIndex).then(function(page) {
const task = new WorkerTask(`GetAnnotations: page ${pageIndex}`);
startWorkerTask(task);
@ -529,23 +529,23 @@ class WorkerMessageHandler {
});
});
handler.on("GetFieldObjects", function (data) {
handler.on("GetFieldObjects", function(data) {
return pdfManager
.ensureDoc("fieldObjects")
.then(fieldObjects => fieldObjects?.allFields || null);
});
handler.on("HasJSActions", function (data) {
handler.on("HasJSActions", function(data) {
return pdfManager.ensureDoc("hasJSActions");
});
handler.on("GetCalculationOrderIds", function (data) {
handler.on("GetCalculationOrderIds", function(data) {
return pdfManager.ensureDoc("calculationOrderIds");
});
handler.on(
"SaveDocument",
async function ({ isPureXfa, numPages, annotationStorage, filename }) {
async function({ isPureXfa, numPages, annotationStorage, filename }) {
const globalPromises = [
pdfManager.requestLoadedStream(),
pdfManager.ensureCatalog("acroForm"),
@ -615,7 +615,7 @@ class WorkerMessageHandler {
imagePromises,
changes
)
.finally(function () {
.finally(function() {
finishWorkerTask(task);
});
})
@ -652,13 +652,13 @@ class WorkerMessageHandler {
} else {
for (let pageIndex = 0; pageIndex < numPages; pageIndex++) {
promises.push(
pdfManager.getPage(pageIndex).then(function (page) {
pdfManager.getPage(pageIndex).then(function(page) {
const task = new WorkerTask(`Save: page ${pageIndex}`);
startWorkerTask(task);
return page
.save(handler, task, annotationStorage, changes)
.finally(function () {
.finally(function() {
finishWorkerTask(task);
});
})
@ -748,6 +748,31 @@ class WorkerMessageHandler {
}
);
// Sends the content stream of page `data.pageIndex` through `sink` as a
// single binary string (one character per byte), then closes the sink.
handler.on("StreamContents", function (data, sink) {
  const pageIndex = data.pageIndex;
  pdfManager
    .getPage(pageIndex)
    .then(page => page.getContentStream())
    .then(stream => {
      let byte;
      let string = "";
      while ((byte = stream.getByte()) !== -1) {
        string += String.fromCharCode(byte);
      }
      sink.enqueue(string, string.length);
      sink.close();
    })
    .catch(reason => {
      // Without this the reader on the other side would wait forever when
      // the page or its content stream cannot be loaded.
      sink.error(reason);
    });
});
// Replaces the contents of page `data.pageIndex` with `data.value`.
// The returned promise resolves (with no value) once the page has been
// updated and rejects if the page cannot be loaded or the update throws;
// wrapping the chain in `new Promise` used to leave it pending forever in
// those cases.
handler.on("UpdateContents", function (data) {
  return pdfManager.getPage(data.pageIndex).then(function (page) {
    page.setContents(data.value);
  });
});
handler.on("GetOperatorList", function (data, sink) {
const pageIndex = data.pageIndex;
pdfManager.getPage(pageIndex).then(function (page) {
@ -767,6 +792,7 @@ class WorkerMessageHandler {
cacheKey: data.cacheKey,
annotationStorage: data.annotationStorage,
modifiedIds: data.modifiedIds,
contentOverride: data.contentOverride,
})
.then(
function (operatorListInfo) {

View File

@ -35,6 +35,7 @@ import {
AbortException,
AnnotationMode,
assert,
djb2Hash,
FeatureTest,
getVerbosityLevel,
info,
@ -1660,6 +1661,37 @@ class PDFPageProxy {
return renderTask;
}
updateContents(newContents) {
if (!newContents) {
throw new Error("Contents may not be null or undefined");
}
this._intentStates.clear();
return this._transport.updateContents(newContents, this._pageIndex);
}
getContents() {
const readableStream = this._transport.streamContents(this._pageIndex);
return new Promise(function (resolve, reject) {
function pump() {
reader.read().then(function ({ value, done }) {
if (done) {
resolve(textContent.text);
return;
}
textContent.text += value;
pump();
}, reject);
}
const reader = readableStream.getReader();
const textContent = {
text: "",
};
pump();
});
}
/**
* @param {GetOperatorListParameters} params - Page getOperatorList
* parameters.
@ -1671,6 +1703,7 @@ class PDFPageProxy {
annotationMode = AnnotationMode.ENABLE,
printAnnotationStorage = null,
isEditing = false,
contentOverride = null,
} = {}) {
if (typeof PDFJSDev !== "undefined" && !PDFJSDev.test("GENERIC")) {
throw new Error("Not implemented: getOperatorList");
@ -1688,7 +1721,8 @@ class PDFPageProxy {
annotationMode,
printAnnotationStorage,
isEditing,
/* isOpList = */ true
/* isOpList = */ true,
contentOverride
);
let intentState = this._intentStates.get(intentArgs.cacheKey);
if (!intentState) {
@ -1910,6 +1944,7 @@ class PDFPageProxy {
cacheKey,
annotationStorageSerializable,
modifiedIds,
contentOverride,
}) {
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
assert(
@ -1927,6 +1962,7 @@ class PDFPageProxy {
cacheKey,
annotationStorage: map,
modifiedIds,
contentOverride,
},
transfer
);
@ -2528,7 +2564,8 @@ class WorkerTransport {
annotationMode = AnnotationMode.ENABLE,
printAnnotationStorage = null,
isEditing = false,
isOpList = false
isOpList = false,
contentOverride = null
) {
let renderingIntent = RenderingIntentFlag.DISPLAY; // Default value.
let annotationStorageSerializable = SerializableEmpty;
@ -2586,11 +2623,16 @@ class WorkerTransport {
modifiedIdsHash,
];
if (contentOverride) {
cacheKeyBuf.push(djb2Hash(contentOverride));
}
return {
renderingIntent,
cacheKey: cacheKeyBuf.join("_"),
annotationStorageSerializable,
modifiedIds,
contentOverride,
};
}
@ -2974,6 +3016,23 @@ class WorkerTransport {
return this.messageHandler.sendWithPromise("GetXRefEntries", null);
}
/**
 * Sends a "StreamContents" request for the given page to the worker.
 *
 * @param {number} pageIndex - Index of the page whose contents are wanted.
 * @returns {Object} Whatever `sendWithStream` returns; callers use
 *   `getReader()` on it (presumably a `ReadableStream` - confirm).
 */
streamContents(pageIndex) {
return this.messageHandler.sendWithStream("StreamContents", {
pageIndex,
});
}
/**
 * Sends an "UpdateContents" request to the worker, carrying the new
 * contents as `value` together with the page index.
 *
 * @param {string} newContents - Replacement content for the page.
 * @param {number} pageIndex - Index of the page to update.
 * @returns {Promise<void>} A promise that is resolved once the contents
 * are updated.
 */
updateContents(newContents, pageIndex) {
return this.messageHandler.sendWithPromise("UpdateContents", {
value: newContents,
pageIndex,
});
}
saveDocument() {
if (this.annotationStorage.size <= 0) {
warn(

View File

@ -578,6 +578,16 @@ function objectFromMap(map) {
return obj;
}
/**
 * Fast, non-cryptographic DJB2 hash of a string.
 *
 * Starts from 5381 and, for every UTF-16 code unit, multiplies the running
 * value by 33 and adds the code unit, wrapping to 32 bits after each step.
 *
 * @param {string} str - The text to hash.
 * @returns {number} The hash as an unsigned 32-bit integer.
 */
function djb2Hash(str) {
  let acc = 5381;
  for (let idx = 0, len = str.length; idx < len; idx++) {
    // `Math.imul` and `| 0` keep the accumulator a signed 32-bit integer.
    acc = (Math.imul(acc, 33) + str.charCodeAt(idx)) | 0;
  }
  // Reinterpret the signed accumulator as unsigned.
  return acc >>> 0;
}
// Checks the endianness of the platform.
function isLittleEndian() {
const buffer8 = new Uint8Array(4);
@ -1136,6 +1146,7 @@ export {
BASELINE_FACTOR,
bytesToString,
createValidAbsoluteUrl,
djb2Hash,
DocumentActionEventType,
FeatureTest,
FONT_IDENTITY_MATRIX,