From b39440b6e014d17bdbedb97126b3a1a8c5f5c7ea Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Wed, 6 May 2026 16:59:15 +0200 Subject: [PATCH] Simplify '#getFilteredPageIndices' and '#resolveInsertAfterIndices' --- src/core/editor/pdf_editor.js | 189 ++++++++++++++-------------------- test/unit/api_spec.js | 68 ++++++++++++ 2 files changed, 145 insertions(+), 112 deletions(-) diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index b6861034b..d839f3cc6 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -563,9 +563,10 @@ class PDFEditor { * @property {Array} [pageIndices] * position of the pages in the final document. * @property {number} [insertAfter] - * 0-based index in the base sequential document after which to insert the - * pages. Sequential pageInfos (those without pageIndices) have their indices - * shifted to accommodate the insertion. Cannot be combined with pageIndices. + * 0-based index in the base sequential sequence after which to insert the + * pages. When every contributing pageInfo has pageIndices, this is + * interpreted against that explicit layout. Cannot be combined with + * pageIndices on the same entry. */ /** @@ -578,81 +579,58 @@ class PDFEditor { if (!document) { return []; } - let keptIndices, keptRanges, deletedIndices, deletedRanges; - for (const page of includePages || []) { - if (Array.isArray(page)) { - (keptRanges ||= []).push(page); - } else { - (keptIndices ||= new Set()).add(page); + const compile = list => { + if (!list?.length) { + return null; } - } - for (const page of excludePages || []) { - if (Array.isArray(page)) { - (deletedRanges ||= []).push(page); - } else { - (deletedIndices ||= new Set()).add(page); + const indices = new Set(); + const ranges = []; + for (const item of list) { + if (Array.isArray(item)) { + ranges.push(item); + } else { + indices.add(item); + } } - } - const indices = []; + return { indices, ranges }; + }; + const matches = (index, { indices, ranges }) => + indices.has(index) || + ranges.some(([start, end]) => index >= start && index <= end); + const inc = compile(includePages); + const exc = compile(excludePages); + const result = []; for (let i = 0, ii = document.numPages; i < ii; i++) { - if (deletedIndices?.has(i)) { + if (exc && matches(i, exc)) { continue; } - if (deletedRanges) { - let isDeleted = false; - for (const [start, end] of deletedRanges) { - if (i >= start && i <= end) { - isDeleted = true; - break; - } - } - if (isDeleted) { - continue; - } - } - let takePage = false; - if (keptIndices) { - takePage = keptIndices.has(i); - } - if (!takePage && keptRanges) { - for (const [start, end] of keptRanges) { - if (i >= start && i <= end) { - takePage = true; - break; - } - } - } - if (!takePage && !keptIndices && !keptRanges) { - takePage = true; - } - if (takePage) { - indices.push(i); + if (!inc || matches(i, inc)) { + result.push(i); } } - return indices; + return result; } /** * Resolve insertAfter pageInfos by converting them (and sequential pageInfos) * to explicit pageIndices, shifting indices to accommodate each insertion. - * insertAfter values are relative to the base sequential sequence (i.e. the - * concatenation of pages from pageInfos that have neither pageIndices nor - * insertAfter), so they are independent of each other. * @param {Array} pageInfos * @returns {Array} */ #resolveInsertAfterIndices(pageInfos) { - // Single pass: build the base sequential sequence and collect insertAfter - // entries, computing each pageInfo's filtered page count only once and only - // for pageInfos that actually contribute pages. - const sequence = []; // each element is the index into pageInfos + const counts = new Array(pageInfos.length); + const sequence = []; const insertAfterList = []; for (let i = 0; i < pageInfos.length; i++) { const info = pageInfos[i]; - if (!info.document || info.pageIndices) { + if (!info.document) { + counts[i] = 0; + continue; + } + const count = (counts[i] = this.#getFilteredPageIndices(info).length); + if (info.pageIndices) { continue; } - const count = this.#getFilteredPageIndices(info).length; if (info.insertAfter === undefined) { for (let j = 0; j < count; j++) { sequence.push(i); @@ -661,50 +639,49 @@ class PDFEditor { insertAfterList.push({ i, insertAfter: info.insertAfter, count }); } } + if (insertAfterList.length === 0) { + return pageInfos; + } - insertAfterList.sort((a, b) => a.insertAfter - b.insertAfter); + // Partial pageIndices rely on auto-fill in extractPages, which races with + // the slots insertAfter assigns here. + for (let i = 0; i < pageInfos.length; i++) { + const info = pageInfos[i]; + if ( + info.document && + info.pageIndices && + info.pageIndices.length < counts[i] + ) { + throw new Error( + "extractPages: partial pageIndices cannot be combined with insertAfter entries." + ); + } + } - // Partial pageIndices would auto-fill into free slots at extraction time, - // which collides with the positions insertAfter shifts/assigns. Reject. - if (insertAfterList.length > 0) { + insertAfterList.sort((a, b) => a.insertAfter - b.insertAfter || a.i - b.i); + + // If there is no base sequential sequence, resolve insertAfter against the + // explicit layout. Shift pageIndices values but keep their array order: + // extractPages maps each filtered source page to the corresponding + // pageIndices entry. + if ( + sequence.length === 0 && + pageInfos.some(info => info.document && info.pageIndices) + ) { + const updatedPageInfos = pageInfos.slice(); + let maxExistingPos = -1; for (const info of pageInfos) { if (!info.document || !info.pageIndices) { continue; } - const filteredCount = this.#getFilteredPageIndices(info).length; - if (info.pageIndices.length < filteredCount) { - throw new Error( - "extractPages: partial pageIndices cannot be combined with insertAfter entries." - ); - } - } - } - - // If the base sequential sequence is empty but some entries carry explicit - // pageIndices (e.g. after a deletion), resolve each insertAfter against - // that explicit layout: shift existing positions past the insertion point - // and fill the gap. Entries without a document are ignored. With no - // explicit entries we fall through to the sequential branch, whose - // Array.splice already clamps to position 0 on an empty sequence. - const hasExplicitLayout = - insertAfterList.length > 0 && - pageInfos.some(info => info.document && info.pageIndices); - if (sequence.length === 0 && hasExplicitLayout) { - const updatedPageInfos = pageInfos.slice(); - let maxExistingPos = -1; - for (const info of pageInfos) { - if (info.document && info.pageIndices) { - for (const idx of info.pageIndices) { - if (idx > maxExistingPos) { - maxExistingPos = idx; - } + for (const idx of info.pageIndices) { + if (idx > maxExistingPos) { + maxExistingPos = idx; } } } let offset = 0; for (const { i, insertAfter, count } of insertAfterList) { - // Clamp to [-1, maxExistingPos] so out-of-range values don't produce - // negative or gap-creating positions. const threshold = Math.min( Math.max(insertAfter, -1) + offset, maxExistingPos @@ -725,49 +702,39 @@ class PDFEditor { ), }; } - const insertedIndices = []; + const pageIndices = []; for (let k = 0; k < count; k++) { - insertedIndices.push(threshold + 1 + k); + pageIndices.push(threshold + 1 + k); } - const newInfo = { - ...updatedPageInfos[i], - pageIndices: insertedIndices, - }; - delete newInfo.insertAfter; - updatedPageInfos[i] = newInfo; + const result = { ...updatedPageInfos[i], pageIndices }; + delete result.insertAfter; + updatedPageInfos[i] = result; offset += count; maxExistingPos += count; } return updatedPageInfos; } - // insertAfter values are relative to the base sequential sequence (stable - // across entries thanks to the sort above); offset converts them to - // current-sequence positions as we splice. let offset = 0; for (const { i, insertAfter, count } of insertAfterList) { - const insertPos = insertAfter + 1 + offset; + const insertPos = Math.max(insertAfter, -1) + 1 + offset; sequence.splice(insertPos, 0, ...new Array(count).fill(i)); offset += count; } - // Map each pageInfo index to its final positions in the sequence using a - // plain array (keys are dense integers so no need for a Map). const pageIndicesArr = new Array(pageInfos.length); for (let pos = 0; pos < sequence.length; pos++) { const infoIdx = sequence[pos]; (pageIndicesArr[infoIdx] ||= []).push(pos); } - // Return updated pageInfos: sequential and insertAfter pageInfos now have - // explicit pageIndices; already-indexed pageInfos are left unchanged. return pageInfos.map((info, i) => { if (!info.document || info.pageIndices) { return info; } - const newInfo = { ...info, pageIndices: pageIndicesArr[i] || [] }; - delete newInfo.insertAfter; - return newInfo; + const result = { ...info, pageIndices: pageIndicesArr[i] || [] }; + delete result.insertAfter; + return result; }); } @@ -792,9 +759,7 @@ class PDFEditor { task ) { this.#primaryDocument = primaryDocument; - if (pageInfos.some(info => info.insertAfter !== undefined)) { - pageInfos = this.#resolveInsertAfterIndices(pageInfos); - } + pageInfos = this.#resolveInsertAfterIndices(pageInfos); const promises = []; let newIndex = 0; this.isSingleFile = diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index a268524be..3f0c5fe35 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -5584,6 +5584,35 @@ small scripts as well as for`); return refs; }; + async function checkPageNumberOrder(pageInfos, expected) { + let loadingTask = getDocument( + buildGetDocumentParams("page_with_number.pdf") + ); + let pdfDoc = await loadingTask.promise; + const data = await pdfDoc.extractPages(pageInfos); + await loadingTask.destroy(); + + expect(data).not.toBeNull(); + if (!data) { + return; + } + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + expect(pdfDoc.numPages).toEqual(expected.length); + + for (let i = 0, ii = expected.length; i < ii; i++) { + const pageNum = i + 1; + const pdfPage = await pdfDoc.getPage(pageNum); + const { items } = await pdfPage.getTextContent(); + expect(mergeText(items)) + .withContext(`Page ${pageNum}`) + .toEqual(expected[i]); + } + + await loadingTask.destroy(); + } + describe("Merge pdfs", function () { it("should merge three PDFs", async function () { const loadingTask = getDocument( @@ -7423,6 +7452,45 @@ small scripts as well as for`); await loadingTask.destroy(); }); + it("insertAfter preserves non-monotonic explicit pageIndices", async function () { + // Explicit pageIndices place page 2 before page 1. The inserted page 3 + // lands after position 0, so the final order is "2", "3", "1". + await checkPageNumberOrder( + [ + { document: null, includePages: [0, 1], pageIndices: [1, 0] }, + { document: null, includePages: [2], insertAfter: 0 }, + ], + ["2", "3", "1"] + ); + }); + + it("insertAfter shifts only the values past the threshold in non-monotonic pageIndices", async function () { + // pageIndices [2, 0, 1] place pages 1·2·3 as "2", "3", "1". Inserting + // page 4 after position 1 shifts only values > 1, so [2, 0, 1] becomes + // [3, 0, 1] and the final order is "2", "3", "4", "1". + await checkPageNumberOrder( + [ + { document: null, includePages: [0, 1, 2], pageIndices: [2, 0, 1] }, + { document: null, includePages: [3], insertAfter: 1 }, + ], + ["2", "3", "4", "1"] + ); + }); + + it("insertAfter preserves mixed sequential and explicit layouts", async function () { + // Sequential pages 1 and 2 form the base sequence; page 3 is inserted + // after base position 0. The explicit page 4 already accounts for the + // shifted tail at position 3. + await checkPageNumberOrder( + [ + { document: null, includePages: [0, 1] }, + { document: null, includePages: [3], pageIndices: [3] }, + { document: null, includePages: [2], insertAfter: 0 }, + ], + ["1", "3", "2", "4"] + ); + }); + it("rejects partial pageIndices combined with insertAfter", async function () { // Partial pageIndices + insertAfter could race over the same slots; // the resolver throws and the worker surfaces it as a null result.