forge: basics like getPrimByPath, primTree and stream as string implemented
Some checks failed
CI / Test (20) (push) Has been cancelled
CI / Test (22) (push) Has been cancelled
CI / Test (23) (push) Has been cancelled
CodeQL / Analyze (javascript) (push) Has been cancelled
Lint / Lint (lts/*) (push) Has been cancelled
Types tests / Test (lts/*) (push) Has been cancelled

This commit is contained in:
Kilian Schuettler 2025-02-25 18:15:47 +01:00
parent c4784832ac
commit e8063c88b1
7 changed files with 7270 additions and 2 deletions

View File

@ -0,0 +1,72 @@
import { PDFDocument } from "../../src/core/document.js";
import { Stream } from "../../src/core/stream.js";
import fs from "fs";
import {
getPrim,
getPrimTree, getStreamAsImage,
getStreamAsString
} from "../../src/core/obj_walker.js";
import { retrieveXref } from "../../src/core/retrieve_xref.js";
// Entry point of this manual test script: reads a PDF from disk, parses it and
// runs the object-walker demo in `parse`. The file to inspect can be passed as
// the first command-line argument; the previous hard-coded location remains the
// default so existing invocations keep working.
const pdfPath =
  process.argv[2] ??
  "/home/kschuettler/Dokumente/Scientific Papers/PDF Specification/ISO_32000-2_2020(en).pdf";
fs.readFile(pdfPath, (err, data) => {
  if (err) {
    // Without this guard `data` is undefined and the failure surfaces later as
    // an unrelated error inside the Stream/PDFDocument setup.
    console.error(`Could not read ${pdfPath}:`, err);
    process.exitCode = 1;
    return;
  }
  console.log("reading file");
  const stream = new Stream(new Uint8Array(data));
  const manager = { enableXfa: false };
  const doc = new PDFDocument(manager, stream);
  doc.parseStartXRef();
  doc.parse(false);
  console.log(doc.numPages);
  // `parse` is async; report its rejection instead of leaving the promise
  // floating.
  parse(doc).catch(reason => {
    console.error(reason);
    process.exitCode = 1;
  });
});
/**
 * Exercises the object-walker helpers against `doc` and prints each result
 * together with the wall-clock time reported by `console.time`.
 *
 * @param doc - The parsed document handed to the object-walker functions.
 */
async function parse(doc) {
  // Runs `task` between console.time/console.timeEnd for `label`. The timer is
  // intentionally not ended when the task rejects.
  const timed = async (label, task) => {
    console.time(label);
    const outcome = await task();
    console.timeEnd(label);
    return outcome;
  };

  // Disabled: timing `retrieveXref(doc)` under the "xref" label.

  await timed("get prim", () => getPrim("/Page2/Contents/1/", doc));

  const request = {
    key: "Page6",
    children: [
      { key: "CropBox" },
      { key: "Contents", children: [{ key: "1" }] },
      { key: "Resources", children: [{ key: "ProcSet" }] },
    ],
  };
  const tree = await timed("get tree", () => getPrimTree([request], doc));
  logTree(tree);

  const string = await timed("string", () =>
    getStreamAsString("/Page2/Contents/2/Data", doc)
  );
  console.log(string);

  // Alternative target that is not an image: "/Page2/Contents/2/Data".
  const image = await timed("image", () =>
    getStreamAsImage("/Page2/Resources/XObject/Im0/Data", doc)
  );
  console.log(image);
}
/**
 * Prints one line per tree row: indentation by depth, an expansion marker
 * ("v " expanded, "> " collapsed, " " for non-containers), then key, type,
 * sub type (omitted when it is "-") and value separated by " | ".
 *
 * @param tree - Collection of rows exposing depth, key, ptype, sub_type,
 *   value, container and expanded.
 */
function logTree(tree) {
  for (const node of Object.values(tree)) {
    const indent = " ".repeat(node.depth);
    let marker = " ";
    if (node.container) {
      marker = node.expanded ? "v " : "> ";
    }
    const subTypeColumn = node.sub_type !== "-" ? `${node.sub_type} | ` : "";
    console.log(
      `${indent}${marker}${node.key} | ${node.ptype} | ${subTypeColumn}${node.value}`
    );
  }
}

View File

@ -1,5 +1,5 @@
{
"stableVersion": "4.10.38",
"stableVersion": "5.0.229",
"baseVersion": "7a57af12e13a47927c460e6b739a6ca132e7603d",
"versionPrefix": "5.0."
}
}

385
src/core/obj_walker.js Normal file
View File

@ -0,0 +1,385 @@
import {isDict, isName, Ref} from "./primitives.js";
import {BaseStream} from "./base_stream.js";
import {PDFImage} from "./image.js";
import {PartialEvaluator} from "./evaluator.js";
import {OperatorList} from "./operator_list.js";
import {LocalColorSpaceCache} from "./image_utils.js";
/**
 * Resolves `path` inside `doc` and wraps the primitive found there in a
 * PrimitiveModel keyed by the last step of the resolved trace.
 *
 * @param path - Path accepted by `getPrimitive`.
 * @param doc - The document to walk.
 * @returns {Promise<PrimitiveModel>}
 */
async function getPrim(path, doc) {
  const [primitive, steps] = await getPrimitive(path, doc);
  const { key } = steps.at(-1);
  return toModel(key, steps, primitive);
}
/**
 * Returns the decoded bytes of the stream addressed by `path` as a string
 * containing one character per byte.
 *
 * @param {string} path - Path to the stream's synthetic "Data" child, e.g.
 *   "/Page2/Contents/2/Data".
 * @param doc - The document to walk.
 * @returns {Promise<string>}
 * @throws {Error} If `path` does not end with "Data" or the addressed
 *   primitive is not a stream.
 */
async function getStreamAsString(path, doc) {
  if (!path.endsWith("Data")) {
    throw new Error(`Path ${path} does not end with Data!`);
  }
  // Strip only the trailing "/Data" segment. `replace("/Data", "")` removed
  // the first occurrence, which corrupts paths such as "/12/DataSet/Data".
  const dataSuffix = "/Data";
  const streamPath = path.endsWith(dataSuffix)
    ? path.slice(0, -dataSuffix.length)
    : path;
  const [prim] = await getPrimitive(streamPath, doc);
  // `(!prim) instanceof BaseStream` tested a boolean and was always false.
  if (!(prim instanceof BaseStream)) {
    throw new Error(`Selected primitive with path ${path} is not a Stream!`);
  }
  const bytes = prim.getBytes();
  let string = "";
  for (let i = 0; i < bytes.length; i++) {
    string += String.fromCharCode(bytes[i]);
  }
  return string;
}
/**
 * Runs the evaluator's image handling for the image XObject stream addressed
 * by `path` and returns the operator list it was asked to fill.
 *
 * @param {string} path - Path to the stream's synthetic "Data" child, e.g.
 *   "/Page2/Resources/XObject/Im0/Data".
 * @param doc - The document to walk.
 * @returns {Promise<OperatorList>} The operator list passed to
 *   `buildPaintImageXObject`.
 * @throws {Error} If `path` does not end with "Data", the addressed primitive
 *   is not a stream, or its dictionary has no /Subtype /Image.
 */
async function getStreamAsImage(path, doc) {
  if (!path.endsWith("Data")) {
    throw new Error(`Path ${path} does not end with Data!`);
  }
  // Strip only the trailing "/Data" segment. `replace("/Data", "")` removed
  // the first occurrence, which corrupts paths such as "/12/DataSet/Data".
  const dataSuffix = "/Data";
  const streamPath = path.endsWith(dataSuffix)
    ? path.slice(0, -dataSuffix.length)
    : path;
  const [prim] = await getPrimitive(streamPath, doc);
  // `(!prim) instanceof BaseStream` tested a boolean and was always false.
  if (!(prim instanceof BaseStream)) {
    throw new Error(`Selected primitive with path ${path} is not a Stream!`);
  }
  const info = prim.dict;
  if (!info || info.getRaw("Subtype")?.name !== "Image") {
    throw new Error("Selected Stream is not an Image!");
  }
  // NOTE(review): the page used for the id factory and the page index are
  // fixed to 1 regardless of `path` — confirm this is intended.
  const page = await doc.getPage(1);
  const evaluator = new PartialEvaluator({
    xref: doc.xref,
    handler: { sendWithPromise: undefined },
    pageIndex: 1,
    idFactory: page._localIdFactory,
  });
  const operatorList = new OperatorList();
  await evaluator.buildPaintImageXObject({
    resources: [],
    image: prim,
    operatorList,
    localImageCache: doc.catalog.globalImageCache,
    localColorSpaceCache: new LocalColorSpaceCache(),
  });
  // The original statement was the incomplete `return operatorList.;`, a
  // syntax error that prevented the whole module from loading.
  // NOTE(review): returning the full list is a guess — confirm which part of
  // it callers need.
  return operatorList;
}
/**
 * Walks `path` through `doc`: the first segment selects the root via
 * `getRoot`, every following segment is resolved with `resolveStep`.
 *
 * @param path - Slash-separated string or array of segments (see `parsePath`).
 * @param doc - The document to walk.
 * @returns {Promise<Array>} `[primitive, trace]` for the last segment.
 */
async function getPrimitive(path, doc) {
  const { xref } = doc;
  const [rootKey, ...steps] = parsePath(path);
  let [current, currentTrace] = await getRoot(rootKey, doc);
  for (const step of steps) {
    [current, currentTrace] = resolveStep(xref, current, currentTrace, step);
  }
  return [current, currentTrace];
}
/**
 * Builds the flattened tree rows for every root request, one after another,
 * and returns them as a single list in request order.
 *
 * @param request - Iterable of root requests (`{ key, children }`).
 * @param doc - The document to walk.
 * @returns {Promise<Array<TreeViewModel>>}
 */
async function getPrimTree(request, doc) {
  const rowsPerRoot = [];
  for (const rootRequest of request) {
    rowsPerRoot.push(await _getPrimTree(rootRequest, doc));
  }
  return rowsPerRoot.flat();
}
/**
 * Produces the rows for one root request: the expanded root row at depth 0
 * followed by its children (and requested descendants) starting at depth 1.
 *
 * @param request - Root request; `request.key` selects the root.
 * @param doc - The document to walk.
 * @returns {Promise<Array<TreeViewModel>>}
 */
async function _getPrimTree(request, doc) {
  const [rootPrim, rootTrace] = await getRoot(request.key, doc);
  const rootModel = toModel(request.key, rootTrace, rootPrim);
  const rows = [toTreeModel(rootModel, 0, true)];
  addChildren(rootModel, request, rows, rootPrim, doc, rootTrace, 1);
  return rows;
}
/**
 * Appends one row per child of `model` to `results`. A child whose key is
 * listed in `request.children` is emitted as expanded and then recursed into
 * via `expand`; every other child is emitted collapsed.
 */
function addChildren(model, request, results, prim, doc, trace, depth) {
  model.children.forEach(child => {
    const childRequest = request.children?.find(
      candidate => candidate.key === child.key
    );
    if (!childRequest) {
      results.push(toTreeModel(child, depth, false));
      return;
    }
    results.push(toTreeModel(child, depth, true));
    expand(results, prim, childRequest, doc, trace, depth + 1);
  });
}
/**
 * Resolves `request.key` below `rootPrim` and appends the rows of the
 * resolved primitive's children to `results`.
 *
 * @param results - Row list that is appended to.
 * @param rootPrim - The primitive that contains `request.key`.
 * @param request - Request node for the child being expanded.
 * @param doc - The document; its `xref` is used to follow references.
 * @param trace - Trace leading to `rootPrim`.
 * @param depth - Depth assigned to the rows that are added.
 * @throws {Error} When `depth` exceeds 20.
 */
function expand(results, rootPrim, request, doc, trace, depth) {
  if (depth > 20) {
    throw new Error(`Depth limit exceeded: ${depth}`);
  }
  const [prim, childTrace] = resolveStep(
    doc.xref,
    rootPrim,
    trace,
    request.key
  );
  // Use the trace extended by `resolveStep`. Reusing the parent's trace left
  // this step (and any reference jump it made) out of every descendant trace.
  const model = toModel(request.key, childTrace, prim);
  addChildren(model, request, results, prim, doc, childTrace, depth);
}
/**
 * Converts a PrimitiveModel into a flat TreeViewModel row, dropping the
 * children and adding the row's depth and expansion state.
 */
function toTreeModel(primModel, depth, expand) {
  const { key, ptype, sub_type, value, container, trace } = primModel;
  return new TreeViewModel(
    depth,
    key,
    ptype,
    sub_type,
    value,
    container,
    expand,
    trace
  );
}
/**
 * Tells whether `prim` is a dictionary, an array, a reference or a stream.
 */
function isContainer(prim) {
  const containerChecks = [isDict, Array.isArray, isRef, isStream];
  return containerChecks.some(check => check(prim));
}
/**
 * Fetches the root object named by the first path segment.
 *
 * Supported keys:
 *  - "Trailer": the xref trailer,
 *  - "Page<N>": the page object of the N-th page (1-based),
 *  - "<num>": the indirect object with that number and generation 0.
 *
 * @param {string} first - The root key.
 * @param doc - The document to read from.
 * @returns {Promise<Array>} `[root, trace]` where `trace` holds one entry.
 * @throws {Error} If `first` is not a string, or its page or object number is
 *   not a valid integer. Previously such keys became NaN page indices or NaN
 *   object numbers.
 */
async function getRoot(first, doc) {
  if (typeof first !== "string") {
    throw new Error(`Invalid root key: ${String(first)}`);
  }
  if (first === "Trailer") {
    return [doc.xref.trailer, [{ key: first, last_jump: first }]];
  }

  let ref;
  if (first.startsWith("Page")) {
    const pageNumber = Number(first.slice("Page".length));
    if (!Number.isInteger(pageNumber) || pageNumber < 1) {
      throw new Error(`Invalid page root: ${first}`);
    }
    // Page keys are 1-based, `getPage` takes a 0-based index.
    const page = await doc.getPage(pageNumber - 1);
    ref = page.ref;
  } else {
    const objNum = Number(first);
    if (!Number.isInteger(objNum) || objNum < 0) {
      throw new Error(`Invalid object number root: ${first}`);
    }
    ref = new Ref(objNum, 0);
  }
  return [doc.xref.fetch(ref), [{ key: first, last_jump: ref.num }]];
}
/**
 * Normalises a path into its segments. Arrays are returned as-is; strings
 * are split on "/" with empty segments removed.
 *
 * @param {string|Array} path
 * @returns {Array}
 */
function parsePath(path) {
  if (Array.isArray(path)) {
    return path;
  }
  return path.length === 0 ? [] : path.split("/").filter(Boolean);
}
/**
 * Tells whether `obj` is an instance of `Ref`.
 */
function isRef(obj) {
  const isReference = obj instanceof Ref;
  return isReference;
}
/**
 * Resolves one path step below `root` and follows the result through the
 * xref table when it is an indirect reference.
 *
 * @param xref - Used to fetch the target of a reference.
 * @param root - Dictionary, array or stream that contains `step`. For a
 *   stream the step is looked up in the stream's dictionary, matching the
 *   children that `toModel` lists for streams.
 * @param trace - Trace leading to `root`; it is not modified.
 * @param step - Dictionary key or array index (as string).
 * @returns {Array} `[primitive, trace]` where the trace is a new array
 *   extended by `step`. `last_jump` becomes the object number when a
 *   reference was followed and is inherited from the previous entry otherwise.
 * @throws {Error} If `root` cannot contain children or the array index is not
 *   an integer within bounds.
 */
function resolveStep(xref, root, trace, step) {
  const last_jump = trace[trace.length - 1].last_jump;
  const container = isStream(root) ? root.dict : root;

  let prim;
  if (isDict(container)) {
    prim = container.getRaw(step);
  } else if (Array.isArray(container)) {
    const index = Number(step);
    // `isNaN` alone accepted fractional indices such as "1.5", which then
    // silently yielded `undefined`.
    if (!Number.isInteger(index) || index < 0 || index >= container.length) {
      throw new Error(
        `Invalid step ${step} for Array of length: ${container.length}`
      );
    }
    prim = container[index];
  } else {
    throw new Error(
      `Unexpected step ${step} at trace: /${trace.map(t => t.key).join("/")}`
    );
  }

  if (isRef(prim)) {
    const num = prim.num;
    return [xref.fetch(prim), [...trace, { key: step, last_jump: num }]];
  }
  return [prim, [...trace, { key: step, last_jump }]];
}
/**
 * Builds the PrimitiveModel for `prim`, recursively including models for
 * dictionary entries, array items and stream dictionary entries. Streams
 * that have a dictionary additionally get a synthetic "Data" child.
 *
 * @param name - Key under which the primitive was found.
 * @param trace - Trace leading to `prim`; stored on the model unchanged.
 * @param prim - The primitive to describe.
 * @returns {PrimitiveModel}
 */
function toModel(name, trace, prim) {
  const [type, subType] = toType(prim);
  let value = primToString(prim);
  const children = [];

  // New trace for a direct child: same `last_jump` as the parent because raw
  // children are not followed through references here.
  const childTrace = key => [
    ...trace,
    { key, last_jump: trace[trace.length - 1].last_jump },
  ];
  const addDictEntries = dict => {
    for (const key of dict.getKeys()) {
      children.push(toModel(key, childTrace(key), dict.getRaw(key)));
    }
  };

  if (isDict(prim)) {
    value = format_dict_content(prim);
    addDictEntries(prim);
  } else if (Array.isArray(prim)) {
    value = format_arr_content(prim);
    for (const [index, item] of prim.entries()) {
      const key = index.toString();
      children.push(toModel(key, childTrace(key), item));
    }
  } else if (isStream(prim)) {
    const streamDict = prim.dict;
    if (streamDict) {
      value = format_dict_content(streamDict);
      addDictEntries(streamDict);
      children.push(
        new PrimitiveModel(
          "Data",
          "-",
          "-",
          "Stream Data",
          false,
          [],
          childTrace("Data")
        )
      );
    }
  }

  return new PrimitiveModel(
    name,
    type,
    subType,
    value,
    isContainer(prim),
    children,
    trace
  );
}
/**
 * Classifies `prim` as `[type, subType]`. The sub type is the name of the
 * /Type entry for dictionaries and of the /Subtype entry for streams, and
 * "-" whenever there is none.
 *
 * @throws {Error} If `prim` matches none of the known kinds (it is logged
 *   first).
 */
function toType(prim) {
  if (isDict(prim)) {
    const typeEntry = prim.getRaw("Type");
    return ["Dictionary", typeEntry ? typeEntry.name : "-"];
  }
  if (Array.isArray(prim)) {
    return ["Array", "-"];
  }
  if (isStream(prim)) {
    const subtypeEntry = prim.dict?.getRaw("Subtype");
    return ["Stream", subtypeEntry ? subtypeEntry.name : "-"];
  }

  // Order matters: integers must be recognised before generic numbers.
  const scalarKinds = [
    [v => isName(v), "Name"],
    [isInt, "Integer"],
    [isNum, "Number"],
    [isBool, "Boolean"],
    [isString, "String"],
    [isRef, "Reference"],
  ];
  const match = scalarKinds.find(([matches]) => matches(prim));
  if (match) {
    return [match[1], "-"];
  }
  console.log(prim);
  throw new Error("Unknown prim");
}
/**
 * Returns a new array holding the same entries as `trace` (shallow copy).
 */
function copy(trace) {
  const duplicate = Array.from(trace);
  return duplicate;
}
/**
 * Tells whether `v` is a boolean primitive.
 */
function isBool(v) {
  return typeof v === "boolean";
}
/**
 * Tells whether `v` is a number that survives a 32-bit integer truncation
 * unchanged, i.e. an integer within the signed 32-bit range.
 */
function isInt(v) {
  if (typeof v !== "number") {
    return false;
  }
  return (v | 0) === v;
}
/**
 * Tells whether `v` is a number primitive (NaN and infinities included).
 */
function isNum(v) {
  return typeof v === "number";
}
/**
 * Tells whether `v` is a string primitive.
 */
function isString(v) {
  return typeof v === "string";
}
/**
 * Tells whether `v` is an instance of `BaseStream`.
 */
function isStream(v) {
  const isStreamInstance = v instanceof BaseStream;
  return isStreamInstance;
}
/**
 * Returns a short display string for `prim`: a fixed label for containers,
 * the name or text for names and strings, the `toString()` form for numbers
 * and booleans, and "XRef(num, gen)" for references.
 *
 * @throws {Error} If `prim` matches none of the known kinds (it is logged
 *   first).
 */
function primToString(prim) {
  if (isDict(prim)) {
    return "Dictionary";
  }
  if (Array.isArray(prim)) {
    return "Array";
  }
  if (isStream(prim)) {
    return "Stream";
  }
  if (isName(prim)) {
    return prim.name;
  }
  // Integers, other numbers and booleans are all rendered the same way.
  if (isNum(prim) || isBool(prim)) {
    return prim.toString();
  }
  if (isString(prim)) {
    return prim;
  }
  if (isRef(prim)) {
    return "XRef(" + prim.num + ", " + prim.gen + ")";
  }
  console.log(prim);
  throw new Error("Unknown prim");
}
/**
 * Renders a preview of `dict`: its first four entries as "key: value"
 * inside braces, followed by ",..." when there are more.
 */
function format_dict_content(dict) {
  const keys = dict.getKeys();
  const preview = keys
    .slice(0, 4)
    .map(key => `${key}: ${primToString(dict.getRaw(key))}`);
  const overflow = keys.length > 4 ? ",..." : "";
  return `{${preview.join(", ")}${overflow}}`;
}
/**
 * Renders a preview of `arr`: its first four items inside brackets,
 * followed by ",..." when there are more.
 */
function format_arr_content(arr) {
  const preview = arr.slice(0, 4).map(item => primToString(item));
  const overflow = arr.length > 4 ? ",..." : "";
  return `[${preview.join(", ")}${overflow}]`;
}
/**
 * Plain data holder describing one primitive: its key, type, sub type,
 * display value, whether it is a container, its child models and the trace
 * that leads to it.
 */
class PrimitiveModel {
  constructor(
    key,
    ptype,
    sub_type,
    value,
    container,
    children = [],
    trace = []
  ) {
    Object.assign(this, {
      key,
      ptype,
      sub_type,
      value,
      children,
      trace,
      container,
    });
  }
}
/**
 * Plain data holder for one row of the flattened tree view: the primitive's
 * description plus the row's depth and whether it is expanded.
 */
class TreeViewModel {
  constructor(depth, key, ptype, sub_type, value, container, expanded, trace) {
    Object.assign(this, {
      depth,
      key,
      ptype,
      sub_type,
      value,
      container,
      expanded,
      trace,
    });
  }
}
export {
getPrim,
getPrimTree,
getPrimitive,
getStreamAsString,
getStreamAsImage,
PrimitiveModel,
TreeViewModel,
};

42
src/core/retrieve_xref.js Normal file
View File

@ -0,0 +1,42 @@
import {Dict, Ref} from "./primitives.js";
import {BaseStream} from "./base_stream.js";
/**
 * Builds an XRefTable with one XRefEntry per slot of `doc.xref.entries`,
 * in index order.
 *
 * @param doc - The document whose cross-reference entries are listed.
 * @returns {Promise<XRefTable>}
 */
async function retrieveXref(doc) {
  const table = new XRefTable(doc.xref.entries.length);
  for (const [objNum, entry] of doc.xref.entries.entries()) {
    table.entries.push(to_model(objNum, entry, doc.xref));
  }
  return table;
}
/**
 * Describes one cross-reference slot as an XRefEntry.
 *
 * @param i - Object number of the slot.
 * @param entry - The raw xref entry, or `undefined` for a slot the table
 *   does not define.
 * @param xref - Used to fetch the object of an in-use entry to find its type.
 * @returns {XRefEntry} With `obj_type` "Missing", "Free", "Dictionary",
 *   "Stream" or "Unknown".
 */
function to_model(i, entry, xref) {
  if (!entry) {
    // The caller walks the entries array by index, so it can hand over a hole
    // when the table skips object numbers. Reading `entry.free` would throw.
    // NOTE(review): "Missing" with gen/offset 0 is a placeholder — confirm the
    // consumer accepts it.
    return new XRefEntry("Missing", i, 0, 0);
  }
  if (entry.free) {
    return new XRefEntry("Free", i, entry.gen, entry.offset);
  }
  const fetched = xref.fetch(new Ref(i, entry.gen));
  let type = "Unknown";
  if (fetched instanceof Dict) {
    type = "Dictionary";
  } else if (fetched instanceof BaseStream) {
    type = "Stream";
  }
  return new XRefEntry(type, i, entry.gen, entry.offset);
}
/**
 * Plain data holder for a cross-reference listing: the declared size and
 * the entries, which start out empty.
 */
class XRefTable {
  constructor(size) {
    Object.assign(this, { size, entries: [] });
  }
}
/**
 * Plain data holder for one cross-reference entry: object type label,
 * object number, generation number and offset.
 */
class XRefEntry {
  constructor(obj_type, obj_num, gen_num, offset) {
    Object.assign(this, { obj_type, obj_num, gen_num, offset });
  }
}
export { XRefEntry, XRefTable, retrieveXref };

View File

@ -38,6 +38,8 @@ import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { PDFWorkerStream } from "./worker_stream.js";
import { StructTreeRoot } from "./struct_tree.js";
import {getPrim, getPrimTree} from "./obj_walker.js";
import {retrieveXref} from "./retrieve_xref.js";
class WorkerTask {
constructor(name) {
@ -484,6 +486,18 @@ class WorkerMessageHandler {
});
});
// Object-inspection endpoints: each one forwards its payload, together with
// the current PDF document, to the matching helper and returns its promise.
handler.on("GetPrimitiveByPath", path_str =>
  getPrim(path_str, pdfManager.pdfDocument)
);
handler.on("GetXRefEntries", () => retrieveXref(pdfManager.pdfDocument));
handler.on("GetPrimTree", request =>
  getPrimTree(request, pdfManager.pdfDocument)
);
handler.on("GetAnnotations", function ({ pageIndex, intent }) {
return pdfManager.getPage(pageIndex).then(function (page) {
const task = new WorkerTask(`GetAnnotations: page ${pageIndex}`);

View File

@ -1100,6 +1100,26 @@ class PDFDocumentProxy {
return this._transport.downloadInfoCapability.promise;
}
/**
 * Looks up a primitive inside the document by its path. The call is
 * delegated to the transport.
 *
 * @param path - Path of the primitive; forwarded unchanged to the transport.
 *   Presumably a slash-separated string such as "/Page2/Contents/1" —
 *   confirm against the worker-side handler.
 * @returns {Promise<PrimitiveModel>} A promise that is resolved to a view of a primitive inside the document.
 */
getPrimitiveByPath(path) {
return this._transport.getPrimitiveByPath(path);
}
/**
 * Requests a flattened tree view of primitives inside the document. The call
 * is delegated to the transport.
 *
 * @param request - Description of the roots and children to expand;
 *   forwarded unchanged to the transport. Presumably an array of
 *   `{ key, children }` nodes — confirm against the worker-side handler.
 * @returns {Promise<Array<TreeViewModel>>} A promise that is resolved to the
 *   rows of a tree view of primitives inside the document.
 */
getPrimitiveTree(request) {
return this._transport.getPrimitiveTree(request);
}
/**
* @returns {Promise<XRefTable>} A promise that is resolved to a view of the Cross-Reference Table.
*/
getXRefEntries() {
return this._transport.getXRefEntries();
}
/**
* Cleans up resources allocated by the document on both the main and worker
* threads.
@ -2908,6 +2928,19 @@ class WorkerTransport {
return this.messageHandler.sendWithPromise("GetData", null);
}
/**
 * Sends the "GetPrimitiveByPath" message with `path` as its payload.
 *
 * @param path - Forwarded unchanged as the message payload.
 * @returns The value returned by `messageHandler.sendWithPromise`.
 */
getPrimitiveByPath(path) {
return this.messageHandler.sendWithPromise("GetPrimitiveByPath", path);
}
/**
 * Sends the "GetPrimTree" message with `request` as its payload.
 *
 * @param request - Forwarded unchanged as the message payload.
 * @returns The value returned by `messageHandler.sendWithPromise`.
 */
getPrimitiveTree(request) {
return this.messageHandler.sendWithPromise("GetPrimTree", request);
}
/**
 * Sends the "GetXRefEntries" message with a `null` payload.
 *
 * NOTE(review): the method name is spelled with a lower-case "r" while the
 * message name is "GetXRefEntries"; callers must use this exact spelling.
 *
 * @returns The value returned by `messageHandler.sendWithPromise`.
 */
getXrefEntries() {
return this.messageHandler.sendWithPromise("GetXRefEntries", null);
}
saveDocument() {
if (this.annotationStorage.size <= 0) {
warn(

6722
yarn.lock Normal file

File diff suppressed because it is too large Load Diff