Fixed some bugs in the API
Some checks failed
CI / Test (20) (push) Has been cancelled
CI / Test (22) (push) Has been cancelled
CI / Test (23) (push) Has been cancelled
CodeQL / Analyze (javascript) (push) Has been cancelled
Lint / Lint (lts/*) (push) Has been cancelled
Types tests / Test (lts/*) (push) Has been cancelled

This commit is contained in:
Kilian Schuettler 2025-03-01 01:00:35 +01:00
parent e8063c88b1
commit 88a38b6c5b
7 changed files with 2038 additions and 2016 deletions

View File

@ -0,0 +1,31 @@
import fs from "fs";
import { getDocument } from "../../build/dist/build/pdf.mjs";
// Input PDF: first CLI argument, or the bundled sample document by default.
const inputPath =
  process.argv[2] || "../../web/compressed.tracemonkey-pldi-09.pdf";

// Options handed to pdf.js: the file contents as a typed array, plus the
// locations of the external cmaps and the standard fonts some PDFs need.
const documentOptions = {
  data: new Uint8Array(fs.readFileSync(inputPath)),
  cMapUrl: "../../../node_modules/pdfjs-dist/cmaps/",
  cMapPacked: true,
  standardFontDataUrl: "../../../node_modules/pdfjs-dist/standard_fonts/",
};

// Kick off loading before entering the try block.
const pendingDocument = getDocument(documentOptions);

try {
  const pdf = await pendingDocument.promise;
  console.log("# PDF document loaded.");
  // Print the stream found at /Page2/Contents/0 as a string.
  const streamText = await pdf.getStreamAsString("/Page2/Contents/0/Data");
  console.log(streamText);
} catch (error) {
  console.error(error);
}

View File

@ -0,0 +1,90 @@
import {
getImageAsBlob,
getPrim,
getPrimTree,
getStreamAsString,
} from "../../src/core/obj_walker.js";
import fs from "fs";
import { PDFDocument } from "../../src/core/document.js";
import { Stream } from "../../src/core/stream.js";
// PDF to inspect: first CLI argument, falling back to the author's local copy
// of the ISO 32000-2 specification.
const filePath =
  process.argv[2] ||
  "C:\\Users\\kj131\\pdf-forge\\test_pdfs\\ISO_32000-2_2020(en).pdf";

fs.readFile(filePath, async (err, data) => {
  if (err) {
    console.error("Error reading file:", err);
    // Signal the failure to the calling shell without aborting abruptly.
    process.exitCode = 1;
    return;
  }
  console.log("Reading file");
  // `data` is already a Buffer, so it is passed to Stream without copying.
  const stream = new Stream(data);
  // Minimal stand-in for the PDF manager; only `enableXfa` is provided.
  const manager = { enableXfa: false };
  const doc = new PDFDocument(manager, stream);
  try {
    doc.parseStartXRef();
    doc.parse(false);
    console.log("Number of pages:", doc.numPages);
    // `parse` is async: await it so a rejection is reported by the catch
    // below instead of surfacing as an unhandled promise rejection.
    await parse(doc);
  } catch (e) {
    console.error("Failed to parse PDF:", e);
    process.exitCode = 1;
  }
});
/**
 * Scratch driver for the object-walker helpers: times the extraction of the
 * stream addressed by /Page2/Contents/0/Data and prints the resulting string.
 *
 * Earlier experiments (xref retrieval, getPrim, getPrimTree + logTree, and
 * getImageAsBlob on /Page2/Resources/XObject/Im0/Data) are currently disabled.
 *
 * @param {PDFDocument} doc - The parsed document to inspect.
 * @returns {Promise<void>}
 */
async function parse(doc) {
  const timerLabel = "string";
  const streamPath = "/Page2/Contents/0/Data";

  console.time(timerLabel);
  const contents = await getStreamAsString(streamPath, doc);
  console.timeEnd(timerLabel);

  console.log(contents);
}
/**
 * Prints one line per tree node: indentation by depth, an expansion marker
 * for containers, the key, the primitive type and the keys along its trace.
 *
 * @param {Iterable<Object>} tree - Nodes exposing `depth`, `container`,
 *   `expanded`, `key`, `ptype` and `trace` (entries carrying a `key`).
 */
function logTree(tree) {
  // Containers show "v " when expanded and "> " when collapsed; leaves only
  // get padding.
  const markerFor = entry => {
    if (!entry.container) {
      return " ";
    }
    return entry.expanded ? "v " : "> ";
  };

  for (const entry of tree) {
    const prefix = " ".repeat(entry.depth) + markerFor(entry);
    const tracePath = entry.trace.map(step => step.key).join(", ");
    console.log(`${prefix}${entry.key} | ${entry.ptype} | ${tracePath}`);
  }
}

View File

@ -1,72 +0,0 @@
import { PDFDocument } from "../../src/core/document.js";
import { Stream } from "../../src/core/stream.js";
import fs from "fs";
import {
getPrim,
getPrimTree, getStreamAsImage,
getStreamAsString
} from "../../src/core/obj_walker.js";
import { retrieveXref } from "../../src/core/retrieve_xref.js";
// Reads the PDF from disk, parses it and hands the document to `parse`.
fs.readFile(
  "/home/kschuettler/Dokumente/Scientific Papers/PDF Specification/ISO_32000-2_2020(en).pdf",
  async (err, data) => {
    // Without this guard a failed read would continue with `data` undefined.
    if (err) {
      console.error("Error reading file:", err);
      return;
    }
    console.log("reading file");
    const stream = new Stream(new Uint8Array(data));
    // Minimal stand-in for the PDF manager; only `enableXfa` is provided.
    const manager = { enableXfa: false };
    const doc = new PDFDocument(manager, stream);
    try {
      doc.parseStartXRef();
      doc.parse(false);
      console.log(doc.numPages);
      // `parse` is async: await it so rejections are reported here rather
      // than becoming unhandled promise rejections.
      await parse(doc);
    } catch (e) {
      console.error("Failed to parse PDF:", e);
    }
  }
);
/**
 * Scratch driver that exercises the object-walker helpers one after another
 * and reports how long each call takes:
 *  1. getPrim on /Page2/Contents/1/ (result is not printed),
 *  2. getPrimTree for a Page6 request, printed through logTree,
 *  3. getStreamAsString on /Page2/Contents/2/Data,
 *  4. getStreamAsImage on /Page2/Resources/XObject/Im0/Data.
 *
 * @param {PDFDocument} doc - The parsed document to inspect.
 * @returns {Promise<void>}
 */
async function parse(doc) {
  // Wraps one awaited call in a console.time/console.timeEnd pair.
  const timed = async (label, task) => {
    console.time(label);
    const outcome = await task();
    console.timeEnd(label);
    return outcome;
  };

  await timed("get prim", () => getPrim("/Page2/Contents/1/", doc));

  const pageRequest = {
    key: "Page6",
    children: [
      { key: "CropBox" },
      { key: "Contents", children: [{ key: "1" }] },
      { key: "Resources", children: [{ key: "ProcSet" }] },
    ],
  };
  const nodes = await timed("get tree", () => getPrimTree([pageRequest], doc));
  logTree(nodes);

  const text = await timed("string", () =>
    getStreamAsString("/Page2/Contents/2/Data", doc)
  );
  console.log(text);

  const picture = await timed("image", () =>
    getStreamAsImage("/Page2/Resources/XObject/Im0/Data", doc)
  );
  console.log(picture);
}
/**
 * Prints one line per tree node: indentation by depth, an expansion marker
 * for containers, then key, primitive type, sub type (unless it is "-") and
 * value.
 *
 * @param {Iterable<Object>} tree - Nodes exposing `depth`, `container`,
 *   `expanded`, `key`, `ptype`, `sub_type` and `value`.
 */
function logTree(tree) {
  // Iterate the values directly: `for...in` walks keys and would also visit
  // enumerable properties inherited from the prototype chain.
  for (const node of tree) {
    let marker = " ";
    if (node.container) {
      marker = node.expanded ? "v " : "> ";
    }
    let str = `${" ".repeat(node.depth)}${marker}${node.key} | ${node.ptype} | `;
    if (node.sub_type !== "-") {
      str += `${node.sub_type} | `;
    }
    str += node.value;
    console.log(str);
  }
}

View File

@ -1,36 +1,35 @@
import {isDict, isName, Ref} from "./primitives.js";
import {BaseStream} from "./base_stream.js";
import {PDFImage} from "./image.js";
import {PartialEvaluator} from "./evaluator.js";
import {OperatorList} from "./operator_list.js";
import {LocalColorSpaceCache} from "./image_utils.js";
import { Dict, Name, Ref } from "./primitives.js";
import { BaseStream } from "./base_stream.js";
import { LocalColorSpaceCache } from "./image_utils.js";
import { PDFFunctionFactory } from "./function.js";
import { PDFImage } from "./image.js";
async function getPrim(path, doc) {
const [prim, trace] = await getPrimitive(path, doc);
return toModel(trace[trace.length - 1].key, trace, prim);
return toModel(trace.at(-1).key, trace, prim);
}
/**
 * Reads the bytes of the stream addressed by `path` and returns them as a
 * string, one character per byte.
 *
 * @param {string} path - Object path ending in "Data", for example
 *   "/Page2/Contents/0/Data".
 * @param {PDFDocument} doc - The document to resolve the path against.
 * @returns {Promise<string>} The stream bytes mapped through
 *   String.fromCharCode.
 * @throws {Error} If `path` does not end with "Data", or if the primitive it
 *   resolves to is not a stream.
 */
async function getStreamAsString(path, doc) {
  if (!path.endsWith("Data")) {
    throw new Error(`Path ${path} does not end with Data!`);
  }
  // Drop only the trailing "/Data" segment. `path.replace("/Data", "")` would
  // remove the first occurrence and corrupt paths such as "/7/DataSet/Data".
  const dataSuffix = "/Data";
  const streamPath = path.endsWith(dataSuffix)
    ? path.slice(0, -dataSuffix.length)
    : path;
  const [prim] = await getPrimitive(streamPath, doc);
  // The negation has to wrap the whole test: `(!prim) instanceof BaseStream`
  // checks a boolean against the class and is therefore always false.
  if (!(prim instanceof BaseStream)) {
    throw new Error(`Selected primitive with path ${path} is not a Stream!`);
  }
  const bytes = prim.getBytes();
  let string = "";
  for (let i = 0; i < bytes.length; i++) {
    string += String.fromCharCode(bytes[i]);
  }
  return string;
}
async function getStreamAsImage(path, doc) {
async function getImageAsBlob(path, doc) {
if (!path.endsWith("Data")) {
throw new Error(`Path ${path} does not end with Data!`);
}
const [prim, trace] = await getPrimitive(path.replace("/Data", ""), doc);
const [prim] = await getPrimitive(path.replace("/Data", ""), doc);
if ((!prim) instanceof BaseStream) {
throw new Error(`Selected primitive with path ${path} is not a Stream!`);
}
@ -38,22 +37,18 @@ async function getStreamAsImage(path, doc) {
if (!info || info.getRaw("Subtype")?.name !== "Image") {
throw new Error(`Selected Stream is not an Image!"`);
}
const page = await doc.getPage(1);
const evaluator = new PartialEvaluator({
const pdfFunctionFactory = new PDFFunctionFactory({
xref: doc.xref,
isEvalSupported: true,
});
const pdfImage = new PDFImage({
xref: doc.xref,
handler: {sendWithPromise: undefined},
pageIndex: 1,
idFactory: page._localIdFactory,
})
const operatorList = new OperatorList();
await evaluator.buildPaintImageXObject({
resources: [],
image: prim,
operatorList,
localImageCache: doc.catalog.globalImageCache,
pdfFunctionFactory,
localColorSpaceCache: new LocalColorSpaceCache(),
})
return operatorList.;
});
const imageData = await pdfImage.createImageData(true, false);
return new Blob([imageData.data], { type: "image/png" });
}
async function getPrimitive(path, doc) {
@ -76,8 +71,8 @@ async function getPrimTree(request, doc) {
}
async function _getPrimTree(request, doc) {
let results = [];
let [prim, trace] = await getRoot(request.key, doc);
const results = [];
const [prim, trace] = await getRoot(request.key, doc);
const root = toModel(request.key, trace, prim);
results.push(toTreeModel(root, 0, true));
addChildren(root, request, results, prim, doc, trace, 1);
@ -86,23 +81,23 @@ async function _getPrimTree(request, doc) {
function addChildren(model, request, results, prim, doc, trace, depth) {
for (const child of model.children) {
let childRequest = request.children?.find(c => c.key === child.key);
const childRequest = request.children?.find(c => c.key === child.key);
if (childRequest) {
results.push(toTreeModel(child, depth, true));
expand(results, prim, childRequest, doc, trace, depth + 1);
expandPrim(results, prim, childRequest, doc, trace, depth + 1);
} else {
results.push(toTreeModel(child, depth, false));
}
}
}
function expand(results, rootPrim, request, doc, trace, depth) {
function expandPrim(results, rootPrim, request, doc, trace, depth) {
if (depth > 20) {
throw new Error(`Depth limit exceeded: ${depth}`);
}
let [prim, _trace] = resolveStep(doc.xref, rootPrim, trace, request.key);
const model = toModel(request.key, trace, prim);
addChildren(model, request, results, prim, doc, trace, depth);
const [prim, _trace] = resolveStep(doc.xref, rootPrim, trace, request.key);
const model = toModel(request.key, _trace, prim);
addChildren(model, request, results, prim, doc, _trace, depth);
}
function toTreeModel(primModel, depth, expand) {
@ -119,24 +114,26 @@ function toTreeModel(primModel, depth, expand) {
}
function isContainer(prim) {
return isDict(prim) || Array.isArray(prim) || isRef(prim) || isStream(prim);
return (
prim instanceof Dict || Array.isArray(prim) || isRef(prim) || isStream(prim)
);
}
async function getRoot(first, doc) {
let root;
let trace = [];
const trace = [];
if (first === "Trailer") {
root = doc.xref.trailer;
trace.push({key: first, last_jump: first});
trace.push({ key: first, last_jump: first });
} else if (first.startsWith("Page")) {
const page = await doc.getPage(+first.replace("Page", "") - 1);
const ref = page.ref;
root = doc.xref.fetch(ref);
trace.push({key: first, last_jump: ref.num});
trace.push({ key: first, last_jump: ref.num });
} else {
const ref = new Ref(+first, 0);
const ref = Ref.get(+first, 0);
root = doc.xref.fetch(ref);
trace.push({key: first, last_jump: ref.num});
trace.push({ key: first, last_jump: ref.num });
}
return [root, trace];
}
@ -157,8 +154,8 @@ function isRef(obj) {
function resolveStep(xref, root, trace, step) {
let prim;
let last_jump = trace[trace.length - 1].last_jump;
if (isDict(root)) {
const last_jump = trace.at(-1).last_jump;
if (root instanceof Dict) {
prim = root.getRaw(step);
} else if (Array.isArray(root)) {
const _step = +step;
@ -168,41 +165,43 @@ function resolveStep(xref, root, trace, step) {
);
}
prim = root[_step];
} else if (root instanceof BaseStream && root.dict) {
prim = root.dict.getRaw(step);
} else {
throw new Error(
`Unexpected step ${step} at trace: /${trace.map(t => t.key).join("/")}`
);
}
let _trace = copy(trace);
const _trace = copy(trace);
if (isRef(prim)) {
const num = prim.num;
prim = xref.fetch(prim);
_trace.push({key: step, last_jump: num});
_trace.push({ key: step, last_jump: num });
} else {
_trace.push({key: step, last_jump: last_jump});
_trace.push({ key: step, last_jump });
}
return [prim, _trace];
}
function toModel(name, trace, prim) {
const [type, subType] = toType(prim);
var value = primToString(prim);
var children = [];
if (isDict(prim)) {
let value = primToString(prim);
const children = [];
if (prim instanceof Dict) {
value = format_dict_content(prim);
const keys = prim.getKeys();
const last = trace[trace.length - 1];
const last = trace.at(-1);
keys.forEach(child => {
let _trace = copy(trace);
_trace.push({key: child, last_jump: last.last_jump});
const _trace = copy(trace);
_trace.push({ key: child, last_jump: last.last_jump });
children.push(toModel(child, _trace, prim.getRaw(child)));
});
} else if (Array.isArray(prim)) {
value = format_arr_content(prim);
const last = trace[trace.length - 1];
const last = trace.at(-1);
for (let i = 0; i < prim.length; i++) {
let _trace = copy(trace);
_trace.push({key: i.toString(), last_jump: last.last_jump});
const _trace = copy(trace);
_trace.push({ key: i.toString(), last_jump: last.last_jump });
children.push(toModel(i.toString(), _trace, prim[i]));
}
} else if (isStream(prim)) {
@ -210,14 +209,14 @@ function toModel(name, trace, prim) {
if (info_dict) {
value = format_dict_content(info_dict);
const keys = info_dict.getKeys();
const last = trace[trace.length - 1];
const last = trace.at(-1);
keys.forEach(child => {
let _trace = copy(trace);
_trace.push({key: child, last_jump: last.last_jump});
const _trace = copy(trace);
_trace.push({ key: child, last_jump: last.last_jump });
children.push(toModel(child, _trace, info_dict.getRaw(child)));
});
let _trace = copy(trace);
_trace.push({key: "Data", last_jump: last.last_jump});
const _trace = copy(trace);
_trace.push({ key: "Data", last_jump: last.last_jump });
children.push(
new PrimitiveModel("Data", "-", "-", "Stream Data", false, [], _trace)
);
@ -235,7 +234,7 @@ function toModel(name, trace, prim) {
}
function toType(prim) {
if (isDict(prim)) {
if (prim instanceof Dict) {
const subType = prim.getRaw("Type");
return ["Dictionary", subType ? subType.name : "-"];
} else if (Array.isArray(prim)) {
@ -243,7 +242,7 @@ function toType(prim) {
} else if (isStream(prim)) {
const subType = prim.dict?.getRaw("Subtype");
return ["Stream", subType ? subType.name : "-"];
} else if (isName(prim)) {
} else if (prim instanceof Name) {
return ["Name", "-"];
} else if (isInt(prim)) {
return ["Integer", "-"];
@ -255,14 +254,12 @@ function toType(prim) {
return ["String", "-"];
} else if (isRef(prim)) {
return ["Reference", "-"];
} else {
console.log(prim);
throw new Error("Unknown prim");
}
throw new Error("Unknown prim");
}
function copy(trace) {
var _trace = [];
const _trace = [];
for (let i = 0; i < trace.length; i++) {
_trace.push(trace[i]);
}
@ -270,19 +267,19 @@ function copy(trace) {
}
function isBool(v) {
return typeof v == "boolean";
return typeof v === "boolean";
}
/**
 * Tells whether `v` is a number without a fractional part.
 *
 * Number.isInteger is used instead of `(v | 0) === v`: the bitwise form
 * truncates to a signed 32-bit value and therefore rejects integers of
 * 2^31 and above.
 *
 * @param {*} v - Value to test.
 * @returns {boolean}
 */
function isInt(v) {
  return Number.isInteger(v);
}
function isNum(v) {
return typeof v == "number";
return typeof v === "number";
}
function isString(v) {
return typeof v == "string";
return typeof v === "string";
}
function isStream(v) {
@ -290,13 +287,13 @@ function isStream(v) {
}
function primToString(prim) {
if (isDict(prim)) {
if (prim instanceof Dict) {
return "Dictionary";
} else if (Array.isArray(prim)) {
return "Array";
} else if (isStream(prim)) {
return "Stream";
} else if (isName(prim)) {
} else if (prim instanceof Name) {
return prim.name;
} else if (isInt(prim)) {
return prim.toString();
@ -308,10 +305,8 @@ function primToString(prim) {
return prim;
} else if (isRef(prim)) {
return "XRef(" + prim.num + ", " + prim.gen + ")";
} else {
console.log(prim);
throw new Error("Unknown prim");
}
throw new Error("Unknown prim");
}
function format_dict_content(dict) {
@ -375,11 +370,11 @@ class TreeViewModel {
}
export {
getImageAsBlob,
getPrim,
getPrimTree,
getPrimitive,
getPrimTree,
getStreamAsString,
getStreamAsImage,
PrimitiveModel,
TreeViewModel,
};

View File

@ -31,15 +31,20 @@ import {
XRefParseException,
} from "./core_utils.js";
import { Dict, isDict, Ref, RefSetCache } from "./primitives.js";
import {
getImageAsBlob,
getPrim,
getPrimTree,
getStreamAsString
} from "./obj_walker.js";
import { LocalPdfManager, NetworkPdfManager } from "./pdf_manager.js";
import { MessageHandler, wrapReason } from "../shared/message_handler.js";
import { AnnotationFactory } from "./annotation.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { PDFWorkerStream } from "./worker_stream.js";
import { retrieveXref } from "./retrieve_xref.js";
import { StructTreeRoot } from "./struct_tree.js";
import {getPrim, getPrimTree} from "./obj_walker.js";
import {retrieveXref} from "./retrieve_xref.js";
class WorkerTask {
constructor(name) {
@ -498,6 +503,14 @@ class WorkerMessageHandler {
return getPrimTree(request, pdfManager.pdfDocument);
});
handler.on("GetImageAsBlob", function (path) {
return getImageAsBlob(path, pdfManager.pdfDocument);
});
handler.on("GetStreamAsString", function (path) {
return getStreamAsString(path, pdfManager.pdfDocument);
});
handler.on("GetAnnotations", function ({ pageIndex, intent }) {
return pdfManager.getPage(pageIndex).then(function (page) {
const task = new WorkerTask(`GetAnnotations: page ${pageIndex}`);

View File

@ -17,6 +17,20 @@
* @module pdfjsLib
*/
import { DOMCMapReaderFactory } from "display-cmap_reader_factory";
import { PDFFetchStream } from "display-fetch_stream";
import { PDFNetworkStream } from "display-network";
import { PDFNodeStream } from "display-node_stream";
import {
NodeCanvasFactory,
NodeCMapReaderFactory,
NodeFilterFactory,
NodeStandardFontDataFactory,
NodeWasmFactory,
} from "display-node_utils";
import { DOMStandardFontDataFactory } from "display-standard_fontdata_factory";
import { DOMWasmFactory } from "display-wasm_factory";
import { MessageHandler, wrapReason } from "../shared/message_handler.js";
import {
AbortException,
AnnotationMode,
@ -37,7 +51,8 @@ import {
PrintAnnotationStorage,
SerializableEmpty,
} from "./annotation_storage.js";
import { FontFaceObject, FontLoader } from "./font_loader.js";
import { CanvasGraphics } from "./canvas.js";
import { DOMCanvasFactory } from "./canvas_factory.js";
import {
isDataScheme,
isValidFetchUrl,
@ -45,28 +60,13 @@ import {
RenderingCancelledException,
StatTimer,
} from "./display_utils.js";
import { MessageHandler, wrapReason } from "../shared/message_handler.js";
import {
NodeCanvasFactory,
NodeCMapReaderFactory,
NodeFilterFactory,
NodeStandardFontDataFactory,
NodeWasmFactory,
} from "display-node_utils";
import { CanvasGraphics } from "./canvas.js";
import { DOMCanvasFactory } from "./canvas_factory.js";
import { DOMCMapReaderFactory } from "display-cmap_reader_factory";
import { DOMFilterFactory } from "./filter_factory.js";
import { DOMStandardFontDataFactory } from "display-standard_fontdata_factory";
import { DOMWasmFactory } from "display-wasm_factory";
import { GlobalWorkerOptions } from "./worker_options.js";
import { FontFaceObject, FontLoader } from "./font_loader.js";
import { Metadata } from "./metadata.js";
import { OptionalContentConfig } from "./optional_content_config.js";
import { PDFDataTransportStream } from "./transport_stream.js";
import { PDFFetchStream } from "display-fetch_stream";
import { PDFNetworkStream } from "display-network";
import { PDFNodeStream } from "display-node_stream";
import { TextLayer } from "./text_layer.js";
import { PDFDataTransportStream } from "./transport_stream.js";
import { GlobalWorkerOptions } from "./worker_options.js";
import { XfaText } from "./xfa_text.js";
const DEFAULT_RANGE_CHUNK_SIZE = 65536; // 2^16 = 65536
@ -1101,23 +1101,43 @@ class PDFDocumentProxy {
}
/**
* @returns {Promise<PrimitiveModel>} A promise that is resolved to a view of a primitive inside the document.
* @returns {Promise<PrimitiveModel>} A promise that is resolved to a view of
* a primitive inside the document.
*/
getPrimitiveByPath(path) {
return this._transport.getPrimitiveByPath(path);
}
/**
* @returns {Promise<TreeViewModel>} A promise that is resolved to a tree view of a primitive inside the document.
* @returns {Promise<TreeViewModel[]>} A promise that is resolved to a tree
* view of a primitive inside the document.
*/
getPrimitiveTree(request) {
return this._transport.getPrimitiveTree(request);
}
/**
* @returns {Promise<XRefTable>} A promise that is resolved to a view of the Cross-Reference Table.
* @returns {Promise<XRefTable>} A promise that is resolved to a view of
* the Cross-Reference Table.
*/
getXRefEntries() {
return this._transport.getXRefEntries();
return this._transport.getXrefEntries();
}
/**
* @returns {Promise<string>} A promise that is resolved to a string
* representing the streams decoded contents.
*/
getStreamAsString(path) {
return this._transport.getStreamAsString(path);
}
/**
* @returns {Promise<Blob>} A promise that is resolved to a blob representing
* the image as png.
*/
getImageByPath(path) {
return this._transport.getImageAsBlob(path);
}
/**
@ -2936,11 +2956,18 @@ class WorkerTransport {
return this.messageHandler.sendWithPromise("GetPrimTree", request);
}
getImageAsBlob(path) {
return this.messageHandler.sendWithPromise("GetImageAsBlob", path);
}
getStreamAsString(path) {
return this.messageHandler.sendWithPromise("GetStreamAsString", path);
}
getXrefEntries() {
return this.messageHandler.sendWithPromise("GetXRefEntries", null);
}
saveDocument() {
if (this.annotationStorage.size <= 0) {
warn(

3624
yarn.lock

File diff suppressed because it is too large Load Diff