Handle corrupt PDFs that lack a /Kids array and instead inline the /Page dictionary (issue 21436)

This basically extends PR 9549 to the fallback `getAllPageDicts` method, which didn't exist at the time, in order to support more cases of corrupt PDF documents.
This commit is contained in:
Jonas Jenwald 2026-06-12 11:43:37 +02:00
parent 63db4bb777
commit 131d6b7d38
5 changed files with 88 additions and 0 deletions

View File

@ -1484,6 +1484,22 @@ class Catalog {
}
}
if (!Array.isArray(kids)) {
// Prevent errors in corrupt PDF documents that violate the
// specification by *inlining* Page dicts (fixes issue21436.pdf).
let type = currentNode.getRaw("Type");
if (type instanceof Ref) {
try {
type = await xref.fetchAsync(type);
} catch (ex) {
addPageError(ex);
break;
}
}
if (isName(type, "Page") || !currentNode.has("Kids")) {
addPageDict(currentNode, null);
break;
}
addPageError(
new FormatError("Page dictionary kids object is not an array.")
);

View File

@ -929,6 +929,7 @@
!issue_cff_unsigned_bbox.pdf
!90ms_rksj_h_sample.pdf
!issue21346.pdf
!issue21436.pdf
!cidfont_cmap_overflow.pdf
!jbig2_file_header.pdf
!text_field_own_canvas_calc.pdf

28
test/pdfs/issue21436.pdf Normal file
View File

@ -0,0 +1,28 @@
%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 3 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] /Contents 4 0 R /Resources << >> >>
endobj
4 0 obj
<< /Length 18 >>
stream
10 10 180 180 re S
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000219 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
287
%%EOF

View File

@ -2802,6 +2802,13 @@
"link": true,
"type": "eq"
},
{
"id": "issue21436",
"file": "pdfs/issue21436.pdf",
"md5": "93c4292a52db8a37eefd651cb677d2a9",
"rounds": 1,
"type": "eq"
},
{
"id": "txt2pdf",
"file": "pdfs/txt2pdf.pdf",

View File

@ -827,6 +827,42 @@ describe("api", function () {
]);
});
it("creates pdf doc from PDF files, with /Pages tree without /Kids array", async function () {
  // Two fixtures are loaded side by side. In issue21436.pdf the catalog's
  // /Pages entry points straight at a /Page dictionary, so there is no /Kids
  // array to walk.
  const taskA = getDocument(buildGetDocumentParams("issue9540.pdf"));
  const taskB = getDocument(buildGetDocumentParams("issue21436.pdf"));
  expect(taskA).toBeInstanceOf(PDFDocumentLoadingTask);
  expect(taskB).toBeInstanceOf(PDFDocumentLoadingTask);

  const docA = await taskA.promise;
  const docB = await taskB.promise;
  expect(docA.numPages).toEqual(1);
  expect(docB.numPages).toEqual(1);

  // Each entry pairs a document with the number of operators expected on its
  // only page. The documents are checked strictly one after the other.
  const expectations = [
    [docA, 19],
    [docB, 1],
  ];
  for (const [pdfDoc, operatorCount] of expectations) {
    const firstPage = await pdfDoc.getPage(1);
    expect(firstPage).toBeInstanceOf(PDFPageProxy);

    const operatorList = await firstPage.getOperatorList();
    expect(operatorList.fnArray.length).toEqual(operatorCount);
    expect(operatorList.argsArray.length).toEqual(operatorCount);
    expect(operatorList.lastChunk).toEqual(true);
    expect(operatorList.separateAnnots).toEqual(null);
  }

  // Tear down both loading tasks together once all checks have run.
  await Promise.all([taskA.destroy(), taskB.destroy()]);
});
it("creates pdf doc from PDF files, with circular references", async function () {
const loadingTask1 = getDocument(
buildGetDocumentParams("poppler-91414-0-53.pdf")