From 131d6b7d38bb795410bb620da4160ac9460e6683 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 12 Jun 2026 11:43:37 +0200 Subject: [PATCH] Handle corrupt PDFs that lack /Kids array and just inline the /Page dictionary (issue 21436) This basically extends PR 9549 to the fallback `getAllPageDicts` method, which didn't exist at the time, in order to support more cases of corrupt PDF documents. --- src/core/catalog.js | 16 ++++++++++++++++ test/pdfs/.gitignore | 1 + test/pdfs/issue21436.pdf | 28 ++++++++++++++++++++++++++++ test/test_manifest.json | 7 +++++++ test/unit/api_spec.js | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 88 insertions(+) create mode 100644 test/pdfs/issue21436.pdf diff --git a/src/core/catalog.js b/src/core/catalog.js index 836f48302..7c2acbcf6 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -1484,6 +1484,22 @@ class Catalog { } } if (!Array.isArray(kids)) { + // Prevent errors in corrupt PDF documents that violate the + // specification by *inlining* Page dicts (fixes issue21436.pdf). 
+ let type = currentNode.getRaw("Type"); + if (type instanceof Ref) { + try { + type = await xref.fetchAsync(type); + } catch (ex) { + addPageError(ex); + break; + } + } + if (isName(type, "Page") || !currentNode.has("Kids")) { + addPageDict(currentNode, null); + break; + } + addPageError( new FormatError("Page dictionary kids object is not an array.") ); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 4d2c9f005..a0fc7d163 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -929,6 +929,7 @@ !issue_cff_unsigned_bbox.pdf !90ms_rksj_h_sample.pdf !issue21346.pdf +!issue21436.pdf !cidfont_cmap_overflow.pdf !jbig2_file_header.pdf !text_field_own_canvas_calc.pdf diff --git a/test/pdfs/issue21436.pdf b/test/pdfs/issue21436.pdf new file mode 100644 index 000000000..92243f359 --- /dev/null +++ b/test/pdfs/issue21436.pdf @@ -0,0 +1,28 @@ +%PDF-1.7 +1 0 obj +<< /Type /Catalog /Pages 3 0 R >> +endobj +2 0 obj +<< /Type /Pages /Kids [3 0 R] /Count 1 >> +endobj +3 0 obj +<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] /Contents 4 0 R /Resources << >> >> +endobj +4 0 obj +<< /Length 18 >> +stream +10 10 180 180 re S +endstream +endobj +xref +0 5 +0000000000 65535 f +0000000009 00000 n +0000000058 00000 n +0000000115 00000 n +0000000219 00000 n +trailer +<< /Size 5 /Root 1 0 R >> +startxref +287 +%%EOF diff --git a/test/test_manifest.json b/test/test_manifest.json index aef2038d3..6581d8314 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -2802,6 +2802,13 @@ "link": true, "type": "eq" }, + { + "id": "issue21436", + "file": "pdfs/issue21436.pdf", + "md5": "93c4292a52db8a37eefd651cb677d2a9", + "rounds": 1, + "type": "eq" + }, { "id": "txt2pdf", "file": "pdfs/txt2pdf.pdf", diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 441421de6..86b9e2c9d 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -827,6 +827,42 @@ describe("api", function () { ]); }); + it("creates pdf doc from PDF files, with /Pages tree without /Kids array", async function () { + const loadingTask1 = getDocument(buildGetDocumentParams("issue9540.pdf")); + const loadingTask2 = getDocument( + buildGetDocumentParams("issue21436.pdf") + ); + + expect(loadingTask1).toBeInstanceOf(PDFDocumentLoadingTask); + expect(loadingTask2).toBeInstanceOf(PDFDocumentLoadingTask); + + const pdfDocument1 = await loadingTask1.promise; + const pdfDocument2 = await loadingTask2.promise; + + expect(pdfDocument1.numPages).toEqual(1); + expect(pdfDocument2.numPages).toEqual(1); + + const pageA = await pdfDocument1.getPage(1); + expect(pageA).toBeInstanceOf(PDFPageProxy); + + const opListA = await pageA.getOperatorList(); + expect(opListA.fnArray.length).toEqual(19); + expect(opListA.argsArray.length).toEqual(19); + expect(opListA.lastChunk).toEqual(true); + expect(opListA.separateAnnots).toEqual(null); + + const pageB = await pdfDocument2.getPage(1); + expect(pageB).toBeInstanceOf(PDFPageProxy); + + const opListB = await pageB.getOperatorList(); + expect(opListB.fnArray.length).toEqual(1); + expect(opListB.argsArray.length).toEqual(1); + expect(opListB.lastChunk).toEqual(true); + expect(opListB.separateAnnots).toEqual(null); + + await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); + }); + it("creates pdf doc from PDF files, with circular references", async function () { const loadingTask1 = getDocument( buildGetDocumentParams("poppler-91414-0-53.pdf")