Download test PDFs with the Fetch API

Using the Fetch API simplifies and shortens the `downloadFile` function considerably since, among other things, it handles redirects[1] by default.

Also, the regular expression in `downloadManifestFiles` can now be replaced with a simple string method (`trimEnd`).

---

[1] Implementations of the Fetch API should already prevent e.g. redirect loops and limit the total number of redirects allowed.
This commit is contained in:
Jonas Jenwald 2026-03-24 20:28:45 +01:00
parent 2643125a12
commit 3e0571cd9c

View File

@@ -15,9 +15,6 @@
import crypto from "crypto";
import fs from "fs";
import http from "http";
import https from "https";
import { resolve as urlResolve } from "url";
function rewriteWebArchiveUrl(url) {
// Web Archive URLs need to be transformed to add `if_` after the ID.
@@ -32,54 +29,21 @@ function rewriteWebArchiveUrl(url) {
return url;
}
function downloadFile(file, url, redirects = 0) {
async function downloadFile(file, url) {
url = rewriteWebArchiveUrl(url);
const protocol = /^https:\/\//.test(url) ? https : http;
return new Promise((resolve, reject) => {
protocol
.get(url, async function (response) {
if ([301, 302, 307, 308].includes(response.statusCode)) {
if (redirects > 10) {
response.resume();
reject(new Error("Too many redirects"));
return;
}
const redirectTo = urlResolve(url, response.headers.location);
try {
await downloadFile(file, redirectTo, ++redirects);
resolve();
} catch (ex) {
response.resume();
reject(ex);
}
return;
}
if (response.statusCode !== 200) {
response.resume();
reject(new Error(`HTTP ${response.statusCode}`));
return;
}
const stream = fs.createWriteStream(file);
stream.on("error", error => reject(error));
stream.on("finish", () => {
stream.end();
resolve();
});
response.pipe(stream);
})
.on("error", error => reject(error));
});
const response = await fetch(url);
if (!response.ok) {
throw new Error(response.statusText);
}
return fs.promises.writeFile(file, response.body);
}
async function downloadManifestFiles(manifest) {
const links = manifest
.filter(item => item.link && !fs.existsSync(item.file))
.map(item => {
let url = fs.readFileSync(`${item.file}.link`).toString();
url = url.replace(/\s+$/, "");
const url = fs.readFileSync(`${item.file}.link`).toString().trimEnd();
return { file: item.file, url };
});