Add an integration-test for merging a corrupt PDF

Currently, when opening a PDF document, the following code is used, where `checkFirstPage`/`checkLastPage` help detect XRef corruption; note 86a18bd5fe/src/core/worker.js (L167-L176)

However, when merging a PDF into an existing document, the parsing is only "partial"; note 86a18bd5fe/src/core/worker.js (L632-L634)

It seems a little strange not to support corrupt PDFs in a consistent manner in the code-base, hence this patch adds a new `BasePdfManager` helper that handles all the relevant parsing/checking, and re-uses it when merging PDFs.
This commit is contained in:
Jonas Jenwald 2026-06-14 09:49:23 +02:00
parent 86a18bd5fe
commit e1c930adfe
3 changed files with 71 additions and 15 deletions

View File

@ -116,6 +116,19 @@ class BasePdfManager {
return this.ensure(this.pdfDocument.catalog, prop, args);
}
/**
 * Runs the document-initialization steps one after another, by awaiting
 * `ensureDoc` for "checkHeader", "parseStartXRef", "parse", "checkFirstPage",
 * and "checkLastPage" (in that order). Each step only starts once the
 * previous one has settled, and a rejection from any step propagates to
 * the caller.
 *
 * @param recoveryMode - Forwarded as the single argument to the "parse",
 *   "checkFirstPage", and "checkLastPage" steps. (Presumably a boolean;
 *   confirm against the callers.)
 * @returns {Promise<undefined>} Settles once all of the steps have completed.
 */
async initDocument(recoveryMode) {
await this.ensureDoc("checkHeader");
await this.ensureDoc("parseStartXRef");
await this.ensureDoc("parse", [recoveryMode]);
// Check that at least the first page can be successfully loaded,
// since otherwise the XRef table is definitely not valid.
await this.ensureDoc("checkFirstPage", [recoveryMode]);
// Check that the last page can be successfully loaded, to ensure that
// `numPages` is correct, and fallback to walking the entire /Pages-tree.
await this.ensureDoc("checkLastPage", [recoveryMode]);
}
// Delegates to `this.pdfDocument.getPage`, returning its result unchanged.
getPage(pageIndex) {
return this.pdfDocument.getPage(pageIndex);
}

View File

@ -164,16 +164,7 @@ class WorkerMessageHandler {
}
async function loadDocument(recoveryMode) {
await pdfManager.ensureDoc("checkHeader");
await pdfManager.ensureDoc("parseStartXRef");
await pdfManager.ensureDoc("parse", [recoveryMode]);
// Check that at least the first page can be successfully loaded,
// since otherwise the XRef table is definitely not valid.
await pdfManager.ensureDoc("checkFirstPage", [recoveryMode]);
// Check that the last page can be successfully loaded, to ensure that
// `numPages` is correct, and fallback to walking the entire /Pages-tree.
await pdfManager.ensureDoc("checkLastPage", [recoveryMode]);
await pdfManager.initDocument(recoveryMode);
const isPureXfa = await pdfManager.ensureDoc("isPureXfa");
if (isPureXfa) {
@ -629,9 +620,7 @@ class WorkerMessageHandler {
while (true) {
try {
await manager.requestLoadedStream();
await manager.ensureDoc("checkHeader");
await manager.ensureDoc("parseStartXRef");
await manager.ensureDoc("parse", [recoveryMode]);
await manager.initDocument(recoveryMode);
break;
} catch (e) {
if (e instanceof XRefParseException) {

View File

@ -127,9 +127,11 @@ async function waitForHavingContents(page, expected) {
});
return page.waitForFunction(
ex => {
const textLayers = document.querySelectorAll(".textLayer");
const buffer = [];
for (const textLayer of document.querySelectorAll(".textLayer")) {
buffer.push(parseInt(textLayer.textContent.trim(), 10));
for (const [i, textLayer] of textLayers.entries()) {
const text = textLayer.textContent.trim();
buffer.push(typeof ex[i] === "string" ? text : parseInt(text, 10));
}
return ex.length === buffer.length && ex.every((v, i) => v === buffer[i]);
},
@ -3396,6 +3398,58 @@ describe("Reorganize Pages View", () => {
})
);
});
// Integration-test: uploads "poppler-91414-0-53.pdf" through the
// "#viewsManagerAddFilePicker" input while page 2 is current, and then checks
// the resulting page count, current page, page contents, and status label.
it("should merge a corrupt PDF (with invalid pages /Count) after the current page", async () => {
await Promise.all(
pages.map(async ([browserName, page]) => {
await waitForThumbnailVisible(page, 1);
// Navigate to page 2 so the merged PDF is inserted after it.
await page.evaluate(() => {
window.PDFViewerApplication.page = 2;
});
await page.waitForFunction(
() => window.PDFViewerApplication.page === 2
);
await waitAndClick(page, getThumbnailSelector(2));
// Set up the one-time "thumbnailsloaded" listener ahead of the upload
// below; it is awaited once the file has been handed to the picker.
const handleMerged = await createPromise(page, resolve => {
window.PDFViewerApplication.eventBus.on(
"thumbnailsloaded",
resolve,
{ once: true }
);
});
const picker = await page.$("#viewsManagerAddFilePicker");
await picker.uploadFile(
path.join(__dirname, "../pdfs/poppler-91414-0-53.pdf")
);
await awaitPromise(handleMerged);
// Original 3 pages + 1 merged page = 4 pages total.
await page.waitForFunction(
() => parseInt(document.getElementById("pageNumber").max, 10) === 4
);
// Focus must move to the first newly inserted page (page 3, since
// we merged after page 2).
await page.waitForFunction(
() => window.PDFViewerApplication.page === 3
);
// Pages 1-2 come from the original document, then the page of
// the merged PDF, then page 3 of the original shifted to the end.
await waitForHavingContents(page, [1, 2, "foobar", 3]);
await waitForTextToBe(
page,
"#viewsManagerStatusActionLabel",
`${FSI}1${PDI} selected`
);
})
);
});
});
describe("Drag-and-drop PDF merge", () => {