forge: basics like getPrimByPath, primTree and stream as string implemented
Some checks failed
CI / Test (20) (push) Has been cancelled
CI / Test (22) (push) Has been cancelled
CI / Test (23) (push) Has been cancelled
CodeQL / Analyze (javascript) (push) Has been cancelled
Lint / Lint (lts/*) (push) Has been cancelled
Types tests / Test (lts/*) (push) Has been cancelled
Some checks failed
CI / Test (20) (push) Has been cancelled
CI / Test (22) (push) Has been cancelled
CI / Test (23) (push) Has been cancelled
CodeQL / Analyze (javascript) (push) Has been cancelled
Lint / Lint (lts/*) (push) Has been cancelled
Types tests / Test (lts/*) (push) Has been cancelled
This commit is contained in:
parent
c4784832ac
commit
e8063c88b1
72
examples/primitive-fetching/fetch_primitives.js
Normal file
72
examples/primitive-fetching/fetch_primitives.js
Normal file
@ -0,0 +1,72 @@
|
||||
import { PDFDocument } from "../../src/core/document.js";
|
||||
import { Stream } from "../../src/core/stream.js";
|
||||
import fs from "fs";
|
||||
import {
|
||||
getPrim,
|
||||
getPrimTree, getStreamAsImage,
|
||||
getStreamAsString
|
||||
} from "../../src/core/obj_walker.js";
|
||||
import { retrieveXref } from "../../src/core/retrieve_xref.js";
|
||||
|
||||
// Input document: first command-line argument if given, otherwise the
// path this example was originally written against.
const pdfPath =
  process.argv[2] ??
  "/home/kschuettler/Dokumente/Scientific Papers/PDF Specification/ISO_32000-2_2020(en).pdf";

// Entry point: read the PDF from disk, build a PDFDocument directly on top of
// the raw bytes (no worker involved) and run the demo queries in `parse`.
fs.readFile(pdfPath, (err, data) => {
  if (err) {
    // Without this guard a failed read would continue with `data` undefined.
    console.error(`Unable to read ${pdfPath}:`, err);
    process.exitCode = 1;
    return;
  }
  console.log("reading file");
  const stream = new Stream(new Uint8Array(data));
  const manager = { enableXfa: false };
  const doc = new PDFDocument(manager, stream);
  doc.parseStartXRef();
  doc.parse(false);
  console.log(doc.numPages);
  // `parse` is async; report failures instead of leaving the promise floating.
  parse(doc).catch(reason => {
    console.error(reason);
    process.exitCode = 1;
  });
});
|
||||
|
||||
/**
 * Demo driver: runs each obj_walker query once against `doc`, timing every
 * call with console.time/timeEnd and printing the results.
 *
 * (A timed `retrieveXref(doc)` call can be added at the top to dump the
 * cross-reference table as well.)
 *
 * @param {PDFDocument} doc - An already parsed document.
 */
async function parse(doc) {
  // 1) Single primitive lookup; only the timing is of interest here.
  console.time("get prim");
  await getPrim("/Page2/Contents/1/", doc);
  console.timeEnd("get prim");

  // 2) Tree lookup: expand Page6 and the listed children below it.
  const contentsRequest = { key: "Contents", children: [{ key: "1" }] };
  const resourcesRequest = {
    key: "Resources",
    children: [{ key: "ProcSet" }],
  };
  const pageRequest = {
    key: "Page6",
    children: [{ key: "CropBox" }, contentsRequest, resourcesRequest],
  };
  console.time("get tree");
  const tree = await getPrimTree([pageRequest], doc);
  console.timeEnd("get tree");
  logTree(tree);

  // 3) Raw stream data rendered as a string.
  console.time("string");
  const string = await getStreamAsString("/Page2/Contents/2/Data", doc);
  console.timeEnd("string");
  console.log(string);

  // 4) Image XObject stream.
  console.time("image");
  const image = await getStreamAsImage(
    "/Page2/Resources/XObject/Im0/Data",
    doc
  );
  console.timeEnd("image");
  console.log(image);
}
|
||||
|
||||
/**
 * Prints one line per tree row: indentation by depth, an expansion marker
 * ("v " expanded container, "> " collapsed container, " " leaf), then
 * `key | type | [sub type |] value`.
 *
 * @param {Array<Object>|Object} tree - Rows as returned by `getPrimTree`.
 */
function logTree(tree) {
  // Object.values visits own entries only; `for...in` would also walk
  // enumerable properties inherited through the prototype chain.
  for (const node of Object.values(tree)) {
    const indent = " ".repeat(node.depth);
    let marker = " ";
    if (node.container) {
      marker = node.expanded ? "v " : "> ";
    }
    // "-" is the placeholder for "no sub type" and is not printed.
    const subType = node.sub_type !== "-" ? `${node.sub_type} | ` : "";
    console.log(
      `${indent}${marker}${node.key} | ${node.ptype} | ${subType}${node.value}`
    );
  }
}
|
||||
@ -1,5 +1,5 @@
|
||||
{
|
||||
"stableVersion": "4.10.38",
|
||||
"stableVersion": "5.0.229",
|
||||
"baseVersion": "7a57af12e13a47927c460e6b739a6ca132e7603d",
|
||||
"versionPrefix": "5.0."
|
||||
}
|
||||
}
|
||||
385
src/core/obj_walker.js
Normal file
385
src/core/obj_walker.js
Normal file
@ -0,0 +1,385 @@
|
||||
import {isDict, isName, Ref} from "./primitives.js";
|
||||
import {BaseStream} from "./base_stream.js";
|
||||
import {PDFImage} from "./image.js";
|
||||
import {PartialEvaluator} from "./evaluator.js";
|
||||
import {OperatorList} from "./operator_list.js";
|
||||
import {LocalColorSpaceCache} from "./image_utils.js";
|
||||
|
||||
/**
 * Resolves `path` inside `doc` and wraps the result in a PrimitiveModel
 * keyed by the last step of the path.
 *
 * @param {string|Array<string>} path - Slash separated path or list of steps.
 * @param {PDFDocument} doc
 * @returns {Promise<PrimitiveModel>}
 */
async function getPrim(path, doc) {
  const [primitive, steps] = await getPrimitive(path, doc);
  const lastStep = steps[steps.length - 1];
  return toModel(lastStep.key, steps, primitive);
}
|
||||
|
||||
/**
 * Returns the bytes of the stream addressed by `path` as a string, one
 * character per byte.
 *
 * @param {string|Array<string>} path - Path whose final step is the pseudo
 *   key "Data", e.g. "/Page2/Contents/2/Data".
 * @param {PDFDocument} doc
 * @returns {Promise<string>}
 * @throws {Error} If the path does not end in a "Data" step, or the
 *   primitive in front of it is not a stream.
 */
async function getStreamAsString(path, doc) {
  const steps = parsePath(path);
  // Compare the final *segment*: a suffix test would also accept keys such as
  // "MetaData", and replacing the first "/Data" could cut the path mid-way.
  if (steps.length < 2 || steps[steps.length - 1] !== "Data") {
    throw new Error(`Path ${path} does not end with Data!`);
  }
  const [prim] = await getPrimitive(steps.slice(0, -1), doc);
  // Parenthesised on purpose: `(!prim) instanceof X` tests a boolean and is
  // always false, which would let non-stream primitives through.
  if (!(prim instanceof BaseStream)) {
    throw new Error(`Selected primitive with path ${path} is not a Stream!`);
  }
  const bytes = prim.getBytes();
  let string = "";
  for (let i = 0; i < bytes.length; i++) {
    string += String.fromCharCode(bytes[i]);
  }
  return string;
}
|
||||
|
||||
/**
 * Runs the image stream addressed by `path` through
 * `PartialEvaluator.buildPaintImageXObject` and returns the operator list
 * that call filled.
 *
 * @param {string|Array<string>} path - Path whose final step is the pseudo
 *   key "Data", e.g. "/Page2/Resources/XObject/Im0/Data".
 * @param {PDFDocument} doc
 * @returns {Promise<OperatorList>}
 * @throws {Error} If the path does not end in a "Data" step, the primitive
 *   is not a stream, or the stream's Subtype is not "Image".
 */
async function getStreamAsImage(path, doc) {
  const steps = parsePath(path);
  // Compare the final *segment*: a suffix test would also accept keys such as
  // "MetaData", and replacing the first "/Data" could cut the path mid-way.
  if (steps.length < 2 || steps[steps.length - 1] !== "Data") {
    throw new Error(`Path ${path} does not end with Data!`);
  }
  const [prim] = await getPrimitive(steps.slice(0, -1), doc);
  // Parenthesised on purpose: `(!prim) instanceof X` tests a boolean and is
  // always false, which would let non-stream primitives through.
  if (!(prim instanceof BaseStream)) {
    throw new Error(`Selected primitive with path ${path} is not a Stream!`);
  }
  const info = prim.dict;
  if (!info || info.getRaw("Subtype")?.name !== "Image") {
    throw new Error("Selected Stream is not an Image!");
  }

  // NOTE(review): the evaluator is always set up against page index 1,
  // regardless of which page the image belongs to - confirm this is intended.
  const page = await doc.getPage(1);
  const evaluator = new PartialEvaluator({
    xref: doc.xref,
    handler: { sendWithPromise: undefined },
    pageIndex: 1,
    idFactory: page._localIdFactory,
  });
  const operatorList = new OperatorList();
  await evaluator.buildPaintImageXObject({
    resources: [],
    image: prim,
    operatorList,
    localImageCache: doc.catalog.globalImageCache,
    localColorSpaceCache: new LocalColorSpaceCache(),
  });
  // NOTE(review): the previous statement was an incomplete member access on
  // `operatorList`; the whole list is returned until the intended property is
  // known - TODO confirm.
  return operatorList;
}
|
||||
|
||||
/**
 * Walks `path` through the document: the first step selects a root object
 * (see `getRoot`), every following step descends one level via
 * `resolveStep`.
 *
 * @param {string|Array<string>} path - Slash separated path or list of steps.
 * @param {PDFDocument} doc
 * @returns {Promise<Array>} `[prim, trace]`: the primitive reached and the
 *   trace entries (`{key, last_jump}`) of every step taken.
 * @throws {Error} If the path contains no steps.
 */
async function getPrimitive(path, doc) {
  const [rootKey, ...steps] = parsePath(path);
  if (rootKey === undefined) {
    // Fail with a readable message instead of a TypeError inside getRoot.
    throw new Error(`Path ${path} is empty!`);
  }
  let [prim, trace] = await getRoot(rootKey, doc);
  for (const step of steps) {
    [prim, trace] = resolveStep(doc.xref, prim, trace, step);
  }
  return [prim, trace];
}
|
||||
|
||||
/**
 * Builds the flat tree view for several root requests, one after the other,
 * and returns all rows in request order.
 *
 * @param {Array<Object>} request - Root requests (`{key, children}`).
 * @param {PDFDocument} doc
 * @returns {Promise<Array<TreeViewModel>>}
 */
async function getPrimTree(request, doc) {
  const rows = [];
  for (const rootRequest of request) {
    const subtree = await _getPrimTree(rootRequest, doc);
    for (const row of subtree) {
      rows.push(row);
    }
  }
  return rows;
}
|
||||
|
||||
/**
 * Builds the rows for a single root request: the root itself (depth 0,
 * expanded) followed by its children as produced by `addChildren`.
 *
 * @param {Object} request - Root request (`{key, children}`).
 * @param {PDFDocument} doc
 * @returns {Promise<Array<TreeViewModel>>}
 */
async function _getPrimTree(request, doc) {
  const [rootPrim, rootTrace] = await getRoot(request.key, doc);
  const rootModel = toModel(request.key, rootTrace, rootPrim);
  const rows = [toTreeModel(rootModel, 0, true)];
  addChildren(rootModel, request, rows, rootPrim, doc, rootTrace, 1);
  return rows;
}
|
||||
|
||||
/**
 * Appends one row per child of `model` to `results`. Children that are named
 * in `request.children` are marked expanded and descended into via `expand`;
 * all others are added collapsed.
 *
 * @param {PrimitiveModel} model - Model whose children are listed.
 * @param {Object} request - Request node matching `model`.
 * @param {Array<TreeViewModel>} results - Output rows (mutated).
 * @param {*} prim - The primitive `model` was built from.
 * @param {PDFDocument} doc
 * @param {Array<Object>} trace - Trace handed on to `expand`.
 * @param {number} depth - Depth assigned to the child rows.
 */
function addChildren(model, request, results, prim, doc, trace, depth) {
  for (const child of model.children) {
    const wanted = request.children?.find(
      candidate => candidate.key === child.key
    );
    if (!wanted) {
      results.push(toTreeModel(child, depth, false));
      continue;
    }
    results.push(toTreeModel(child, depth, true));
    expand(results, prim, wanted, doc, trace, depth + 1);
  }
}
|
||||
|
||||
/**
 * Resolves the child `request.key` of `rootPrim` and appends the rows of
 * that child's own children to `results`.
 *
 * @param {Array<TreeViewModel>} results - Output rows (mutated).
 * @param {*} rootPrim - Primitive that contains the requested child.
 * @param {Object} request - Request node of the child (`{key, children}`).
 * @param {PDFDocument} doc
 * @param {Array<Object>} trace - Trace leading to `rootPrim`.
 * @param {number} depth - Depth assigned to the rows added here.
 * @throws {Error} If `depth` exceeds 20.
 */
function expand(results, rootPrim, request, doc, trace, depth) {
  if (depth > 20) {
    throw new Error(`Depth limit exceeded: ${depth}`);
  }
  const [prim, childTrace] = resolveStep(
    doc.xref,
    rootPrim,
    trace,
    request.key
  );
  // Continue with the trace extended by this step. Passing the parent's
  // trace on would drop the step from every descendant's trace and make
  // deeper levels resolve `last_jump` against the wrong entry.
  const model = toModel(request.key, childTrace, prim);
  addChildren(model, request, results, prim, doc, childTrace, depth);
}
|
||||
|
||||
/**
 * Converts a PrimitiveModel into a flat TreeViewModel row.
 *
 * @param {PrimitiveModel} primModel
 * @param {number} depth - Nesting depth of the row.
 * @param {boolean} expand - Stored as the row's `expanded` flag.
 * @returns {TreeViewModel}
 */
function toTreeModel(primModel, depth, expand) {
  const { key, ptype, sub_type, value, container, trace } = primModel;
  return new TreeViewModel(
    depth,
    key,
    ptype,
    sub_type,
    value,
    container,
    expand,
    trace
  );
}
|
||||
|
||||
/**
 * Tells whether `prim` is a dictionary, an array, a reference or a stream.
 *
 * @param {*} prim
 * @returns {boolean}
 */
function isContainer(prim) {
  if (isDict(prim) || Array.isArray(prim)) {
    return true;
  }
  return isRef(prim) || isStream(prim);
}
|
||||
|
||||
/**
 * Resolves the first step of a path to a root object.
 *
 * Accepted forms of `first`:
 *  - "Trailer": the xref trailer.
 *  - "Page<N>": the page object of the N-th page (1-based).
 *  - "<num>": the indirect object with that number and generation 0.
 *
 * @param {string} first - First path step.
 * @param {PDFDocument} doc
 * @returns {Promise<Array>} `[root, trace]` where `trace` holds the single
 *   entry describing the root.
 * @throws {Error} If `first` matches none of the accepted forms.
 */
async function getRoot(first, doc) {
  if (first === "Trailer") {
    return [doc.xref.trailer, [{ key: first, last_jump: first }]];
  }

  const digitsOnly = /^\d+$/;
  let ref;
  if (first.startsWith("Page")) {
    const pageNumber = first.slice("Page".length);
    // Reject "Page", "Pages", "Page0", ... instead of asking for page
    // index -1 or NaN.
    if (!digitsOnly.test(pageNumber) || Number(pageNumber) < 1) {
      throw new Error(`Invalid page root: ${first}`);
    }
    const page = await doc.getPage(Number(pageNumber) - 1);
    ref = page.ref;
  } else {
    // Anything else must be a plain object number; otherwise a Ref with a
    // NaN number would be created.
    if (!digitsOnly.test(first)) {
      throw new Error(`Invalid root: ${first}`);
    }
    ref = new Ref(Number(first), 0);
  }
  return [doc.xref.fetch(ref), [{ key: first, last_jump: ref.num }]];
}
|
||||
|
||||
/**
 * Splits a slash separated path into its non-empty steps. Arrays are taken
 * to be already split and are returned unchanged.
 *
 * @param {string|Array<string>} path
 * @returns {Array<string>}
 */
function parsePath(path) {
  if (Array.isArray(path)) {
    return path;
  }
  return path.length === 0
    ? []
    : path.split("/").filter(segment => segment.length > 0);
}
|
||||
|
||||
/**
 * @param {*} candidate
 * @returns {boolean} Whether `candidate` is a `Ref` instance.
 */
function isRef(candidate) {
  const result = candidate instanceof Ref;
  return result;
}
|
||||
|
||||
/**
 * Descends one step from `root`.
 *
 * Dictionaries are indexed by key, arrays by integer index, and streams by
 * the keys of their dictionary. If the value found is a reference it is
 * fetched through `xref`, and its object number becomes the `last_jump` of
 * the new trace entry; otherwise the previous `last_jump` is carried over.
 *
 * @param {XRef} xref - Used to fetch referenced objects.
 * @param {*} root - Primitive to descend from.
 * @param {Array<Object>} trace - Trace leading to `root` (not mutated).
 * @param {string} step - Key or array index to follow.
 * @returns {Array} `[prim, trace]`: the primitive reached and a new trace
 *   extended by this step.
 * @throws {Error} If `root` cannot be stepped into, the key does not exist,
 *   or the array index is invalid.
 */
function resolveStep(xref, root, trace, step) {
  const tracePath = `/${trace.map(t => t.key).join("/")}`;

  // Streams list the entries of their dictionary as children, so a step
  // into a stream has to be looked up there.
  let dict = null;
  if (isDict(root)) {
    dict = root;
  } else if (isStream(root) && root.dict) {
    dict = root.dict;
  }

  let prim;
  if (dict) {
    prim = dict.getRaw(step);
    if (prim === undefined) {
      // Surface a lookup miss here rather than failing later on `undefined`.
      throw new Error(`Key ${step} not found at trace: ${tracePath}`);
    }
  } else if (Array.isArray(root)) {
    const index = Number(step);
    // Number.isInteger also rejects NaN and fractional indices.
    if (!Number.isInteger(index) || index < 0 || index >= root.length) {
      throw new Error(
        `Invalid step ${step} for Array of length: ${root.length}`
      );
    }
    prim = root[index];
  } else {
    throw new Error(`Unexpected step ${step} at trace: ${tracePath}`);
  }

  const previousJump = trace[trace.length - 1].last_jump;
  if (isRef(prim)) {
    const jumpedTo = prim.num;
    return [xref.fetch(prim), [...trace, { key: step, last_jump: jumpedTo }]];
  }
  return [prim, [...trace, { key: step, last_jump: previousJump }]];
}
|
||||
|
||||
/**
 * Builds the PrimitiveModel for `prim`, including models for its direct
 * children:
 *  - dictionary: one child per key (raw values, references are not followed),
 *  - array: one child per index,
 *  - stream with a dictionary: one child per dictionary key plus a final
 *    pseudo child "Data" standing for the stream contents.
 *
 * @param {string} name - Key under which `prim` was found.
 * @param {Array<Object>} trace - Trace leading to `prim`.
 * @param {*} prim
 * @returns {PrimitiveModel}
 */
function toModel(name, trace, prim) {
  const [type, subType] = toType(prim);
  let value = primToString(prim);
  const children = [];

  // Copy of `trace` with one more entry; the child inherits the parent's
  // `last_jump`.
  const extendTrace = key => {
    const extended = copy(trace);
    extended.push({ key, last_jump: trace[trace.length - 1].last_jump });
    return extended;
  };
  const addDictEntries = dict => {
    for (const key of dict.getKeys()) {
      children.push(toModel(key, extendTrace(key), dict.getRaw(key)));
    }
  };

  if (isDict(prim)) {
    value = format_dict_content(prim);
    addDictEntries(prim);
  } else if (Array.isArray(prim)) {
    value = format_arr_content(prim);
    for (let index = 0; index < prim.length; index++) {
      const key = index.toString();
      children.push(toModel(key, extendTrace(key), prim[index]));
    }
  } else if (isStream(prim)) {
    const infoDict = prim.dict;
    if (infoDict) {
      value = format_dict_content(infoDict);
      addDictEntries(infoDict);
      children.push(
        new PrimitiveModel(
          "Data",
          "-",
          "-",
          "Stream Data",
          false,
          [],
          extendTrace("Data")
        )
      );
    }
  }

  return new PrimitiveModel(
    name,
    type,
    subType,
    value,
    isContainer(prim),
    children,
    trace
  );
}
|
||||
|
||||
/**
 * Classifies a primitive.
 *
 * @param {*} prim
 * @returns {Array<string>} `[type, subType]`. `subType` is the name stored
 *   under "Type" (dictionaries) or "Subtype" (streams) when that entry is a
 *   Name, otherwise "-".
 * @throws {Error} If `prim` is of no known kind.
 */
function toType(prim) {
  if (prim === null || prim === undefined) {
    // The null object (or an absent value) must not abort the whole walk.
    return ["Null", "-"];
  }
  if (isDict(prim)) {
    const typeEntry = prim.getRaw("Type");
    // Only Names carry a `.name`; anything else falls back to "-".
    return ["Dictionary", isName(typeEntry) ? typeEntry.name : "-"];
  }
  if (Array.isArray(prim)) {
    return ["Array", "-"];
  }
  if (isStream(prim)) {
    const subtypeEntry = prim.dict?.getRaw("Subtype");
    return ["Stream", isName(subtypeEntry) ? subtypeEntry.name : "-"];
  }
  if (isName(prim)) {
    return ["Name", "-"];
  }
  if (isInt(prim)) {
    return ["Integer", "-"];
  }
  if (isNum(prim)) {
    return ["Number", "-"];
  }
  if (isBool(prim)) {
    return ["Boolean", "-"];
  }
  if (isString(prim)) {
    return ["String", "-"];
  }
  if (isRef(prim)) {
    return ["Reference", "-"];
  }
  console.log(prim);
  throw new Error("Unknown prim");
}
|
||||
|
||||
/**
 * Returns a shallow copy of `trace`: a new array holding the same entry
 * objects.
 *
 * @param {Array<Object>} trace
 * @returns {Array<Object>}
 */
function copy(trace) {
  return [...trace];
}
|
||||
|
||||
/**
 * @param {*} v
 * @returns {boolean} Whether `v` is a boolean primitive.
 */
function isBool(v) {
  return typeof v === "boolean";
}
|
||||
|
||||
/**
 * @param {*} v
 * @returns {boolean} Whether `v` is a number with an integral value.
 */
function isInt(v) {
  // Number.isInteger has no 32-bit limit; a `(v | 0)` round-trip truncates
  // to 32 bits and therefore misclassifies large integers.
  return Number.isInteger(v);
}
|
||||
|
||||
/**
 * @param {*} v
 * @returns {boolean} Whether `v` is a number primitive.
 */
function isNum(v) {
  return typeof v === "number";
}
|
||||
|
||||
/**
 * @param {*} v
 * @returns {boolean} Whether `v` is a string primitive.
 */
function isString(v) {
  return typeof v === "string";
}
|
||||
|
||||
/**
 * @param {*} candidate
 * @returns {boolean} Whether `candidate` is a `BaseStream` instance.
 */
function isStream(candidate) {
  const result = candidate instanceof BaseStream;
  return result;
}
|
||||
|
||||
/**
 * Short display text for a primitive: the kind for dictionaries, arrays and
 * streams; the textual value for names, numbers, booleans and strings;
 * `XRef(num, gen)` for references; "null" for the null object.
 *
 * @param {*} prim
 * @returns {string}
 * @throws {Error} If `prim` is of no known kind.
 */
function primToString(prim) {
  if (prim === null || prim === undefined) {
    // The null object (or an absent value) must not abort the whole walk.
    return "null";
  }
  if (isDict(prim)) {
    return "Dictionary";
  }
  if (Array.isArray(prim)) {
    return "Array";
  }
  if (isStream(prim)) {
    return "Stream";
  }
  if (isName(prim)) {
    return prim.name;
  }
  if (isNum(prim) || isBool(prim)) {
    return prim.toString();
  }
  if (isString(prim)) {
    return prim;
  }
  if (isRef(prim)) {
    return "XRef(" + prim.num + ", " + prim.gen + ")";
  }
  console.log(prim);
  throw new Error("Unknown prim");
}
|
||||
|
||||
/**
 * One-line preview of a dictionary: the first four `key: value` pairs in
 * braces, followed by ",..." when there are more entries.
 *
 * @param {Dict} dict
 * @returns {string}
 */
function format_dict_content(dict) {
  const keys = dict.getKeys();
  const shown = [];
  for (const key of keys.slice(0, 4)) {
    shown.push(key + ": " + primToString(dict.getRaw(key)));
  }
  const ellipsis = keys.length > 4 ? ",..." : "";
  return "{" + shown.join(", ") + ellipsis + "}";
}
|
||||
|
||||
/**
 * One-line preview of an array: the first four elements in brackets,
 * followed by ",..." when there are more.
 *
 * @param {Array} arr
 * @returns {string}
 */
function format_arr_content(arr) {
  const shown = [];
  for (const element of arr.slice(0, 4)) {
    shown.push(primToString(element));
  }
  const ellipsis = arr.length > 4 ? ",..." : "";
  return "[" + shown.join(", ") + ellipsis + "]";
}
|
||||
|
||||
/**
 * Plain data view of one primitive together with its direct children.
 */
class PrimitiveModel {
  /**
   * @param {string} key - Key under which the primitive was found.
   * @param {string} ptype - Primitive type label.
   * @param {string} sub_type - Sub type label, "-" if none.
   * @param {string} value - Display text.
   * @param {boolean} container - Whether the primitive can hold children.
   * @param {Array<PrimitiveModel>} [children]
   * @param {Array<Object>} [trace] - Steps leading to the primitive.
   */
  constructor(
    key,
    ptype,
    sub_type,
    value,
    container,
    children = [],
    trace = []
  ) {
    Object.assign(this, {
      key,
      ptype,
      sub_type,
      value,
      children,
      trace,
      container,
    });
  }
}
|
||||
|
||||
/**
 * One row of the flattened tree view.
 */
class TreeViewModel {
  /**
   * @param {number} depth - Nesting depth of the row.
   * @param {string} key
   * @param {string} ptype - Primitive type label.
   * @param {string} sub_type - Sub type label, "-" if none.
   * @param {string} value - Display text.
   * @param {boolean} container - Whether the primitive can hold children.
   * @param {boolean} expanded - Whether the row's children follow it.
   * @param {Array<Object>} trace - Steps leading to the primitive.
   */
  constructor(depth, key, ptype, sub_type, value, container, expanded, trace) {
    Object.assign(this, {
      depth,
      key,
      ptype,
      sub_type,
      value,
      container,
      expanded,
      trace,
    });
  }
}
|
||||
|
||||
export {
|
||||
getPrim,
|
||||
getPrimTree,
|
||||
getPrimitive,
|
||||
getStreamAsString,
|
||||
getStreamAsImage,
|
||||
PrimitiveModel,
|
||||
TreeViewModel,
|
||||
};
|
||||
42
src/core/retrieve_xref.js
Normal file
42
src/core/retrieve_xref.js
Normal file
@ -0,0 +1,42 @@
|
||||
import {Dict, Ref} from "./primitives.js";
|
||||
import {BaseStream} from "./base_stream.js";
|
||||
|
||||
/**
 * Builds an XRefTable with one XRefEntry per slot of `doc.xref.entries`.
 *
 * @param {PDFDocument} doc
 * @returns {Promise<XRefTable>}
 */
async function retrieveXref(doc) {
  const { xref } = doc;
  const table = new XRefTable(xref.entries.length);
  // Index loop on purpose: every slot is visited, and the object number is
  // the slot index.
  for (let objNum = 0; objNum < xref.entries.length; objNum++) {
    const entry = to_model(objNum, xref.entries[objNum], xref);
    table.entries.push(entry);
  }
  return table;
}
|
||||
|
||||
/**
 * Converts one raw xref slot into an XRefEntry. Entries in use are fetched
 * to find out whether they hold a dictionary or a stream.
 *
 * @param {number} i - Object number (index of the slot).
 * @param {Object|undefined} entry - Raw entry with `free`, `gen`, `offset`.
 * @param {XRef} xref - Used to fetch the object behind the entry.
 * @returns {XRefEntry} With `obj_type` "Missing", "Free", "Dictionary",
 *   "Stream" or "Unknown".
 */
function to_model(i, entry, xref) {
  if (!entry) {
    // NOTE(review): guards against slots that were never filled (sparse
    // table); gen/offset are reported as 0 - confirm the preferred
    // representation.
    return new XRefEntry("Missing", i, 0, 0);
  }
  if (entry.free) {
    return new XRefEntry("Free", i, entry.gen, entry.offset);
  }
  const fetched = xref.fetch(new Ref(i, entry.gen));
  let type = "Unknown";
  if (fetched instanceof Dict) {
    type = "Dictionary";
  } else if (fetched instanceof BaseStream) {
    type = "Stream";
  }
  return new XRefEntry(type, i, entry.gen, entry.offset);
}
|
||||
|
||||
/**
 * Container for the entries of a cross-reference table.
 */
class XRefTable {
  /**
   * @param {number} size - Stored as `size`; `entries` starts out empty and
   *   is filled by the caller.
   */
  constructor(size) {
    Object.assign(this, { size, entries: [] });
  }
}
|
||||
|
||||
/**
 * Plain data view of one cross-reference entry.
 */
class XRefEntry {
  /**
   * @param {string} obj_type - Type label of the object behind the entry.
   * @param {number} obj_num - Object number.
   * @param {number} gen_num - Generation number.
   * @param {number} offset - Offset as stored in the raw entry.
   */
  constructor(obj_type, obj_num, gen_num, offset) {
    Object.assign(this, { obj_type, obj_num, gen_num, offset });
  }
}
|
||||
|
||||
export { XRefEntry, XRefTable, retrieveXref };
|
||||
@ -38,6 +38,8 @@ import { clearGlobalCaches } from "./cleanup_helper.js";
|
||||
import { incrementalUpdate } from "./writer.js";
|
||||
import { PDFWorkerStream } from "./worker_stream.js";
|
||||
import { StructTreeRoot } from "./struct_tree.js";
|
||||
import {getPrim, getPrimTree} from "./obj_walker.js";
|
||||
import {retrieveXref} from "./retrieve_xref.js";
|
||||
|
||||
class WorkerTask {
|
||||
constructor(name) {
|
||||
@ -484,6 +486,18 @@ class WorkerMessageHandler {
|
||||
});
|
||||
});
|
||||
|
||||
// Resolve a single primitive by its path and reply with its model.
handler.on("GetPrimitiveByPath", function (path) {
  const { pdfDocument } = pdfManager;
  return getPrim(path, pdfDocument);
});
|
||||
|
||||
// Reply with a view of the document's cross-reference table; the message
// payload is not used.
handler.on("GetXRefEntries", function () {
  const { pdfDocument } = pdfManager;
  return retrieveXref(pdfDocument);
});
|
||||
|
||||
// Reply with the flattened tree rows for the requested roots.
handler.on("GetPrimTree", function (treeRequest) {
  const { pdfDocument } = pdfManager;
  return getPrimTree(treeRequest, pdfDocument);
});
|
||||
|
||||
handler.on("GetAnnotations", function ({ pageIndex, intent }) {
|
||||
return pdfManager.getPage(pageIndex).then(function (page) {
|
||||
const task = new WorkerTask(`GetAnnotations: page ${pageIndex}`);
|
||||
|
||||
@ -1100,6 +1100,26 @@ class PDFDocumentProxy {
|
||||
return this._transport.downloadInfoCapability.promise;
|
||||
}
|
||||
|
||||
/**
|
||||
* @returns {Promise<PrimitiveModel>} A promise that is resolved to a view of a primitive inside the document.
|
||||
*/
|
||||
getPrimitiveByPath(path) {
|
||||
return this._transport.getPrimitiveByPath(path);
|
||||
}
|
||||
/**
|
||||
* @returns {Promise<TreeViewModel>} A promise that is resolved to a tree view of a primitive inside the document.
|
||||
*/
|
||||
getPrimitiveTree(request) {
|
||||
return this._transport.getPrimitiveTree(request);
|
||||
}
|
||||
|
||||
/**
|
||||
* @returns {Promise<XRefTable>} A promise that is resolved to a view of the Cross-Reference Table.
|
||||
*/
|
||||
getXRefEntries() {
|
||||
return this._transport.getXRefEntries();
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleans up resources allocated by the document on both the main and worker
|
||||
* threads.
|
||||
@ -2908,6 +2928,19 @@ class WorkerTransport {
|
||||
return this.messageHandler.sendWithPromise("GetData", null);
|
||||
}
|
||||
|
||||
getPrimitiveByPath(path) {
|
||||
return this.messageHandler.sendWithPromise("GetPrimitiveByPath", path);
|
||||
}
|
||||
|
||||
getPrimitiveTree(request) {
|
||||
return this.messageHandler.sendWithPromise("GetPrimTree", request);
|
||||
}
|
||||
|
||||
getXrefEntries() {
|
||||
return this.messageHandler.sendWithPromise("GetXRefEntries", null);
|
||||
}
|
||||
|
||||
|
||||
saveDocument() {
|
||||
if (this.annotationStorage.size <= 0) {
|
||||
warn(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user