pdf.js.mirror/test/integration/text_extractor_spec.mjs

120 lines
3.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Copyright 2026 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { closePages, loadAndWait } from "./test_utils.mjs";
/**
 * Dispatch a `requestTextContent` custom event on the page's `window`.
 *
 * @param {Object} page - Page handle exposing `evaluate(fn, ...args)`.
 * @param {number} id - Value placed in the event's `detail.requestId`.
 * @returns {Promise} The promise produced by `page.evaluate`; the evaluated
 *   callback itself does not return a value.
 */
async function dispatchRequestTextContent(page, id) {
  // `id` is handed over as an `evaluate` argument rather than captured by the
  // callback, so the callback only depends on its own parameter and `window`.
  const fireRequestEvent = requestId => {
    window.dispatchEvent(
      new CustomEvent("requestTextContent", {
        detail: { requestId },
        cancelable: true,
        bubbles: true,
      })
    );
  };

  return page.evaluate(fireRequestEvent, id);
}
/**
 * Wait until `window._reportTextData` is defined in the page, then return its
 * value and remove the property from `window`.
 *
 * @param {Object} page - Page handle exposing `waitForFunction(fn)` and
 *   `evaluate(fn)`.
 * @returns {Promise} The promise produced by `page.evaluate`, whose callback
 *   returns the value that was stored in `window._reportTextData`.
 */
async function getReportTextData(page) {
  const isReportAvailable = () => window._reportTextData !== undefined;

  const takeReport = () => {
    const { _reportTextData: report } = window;
    // Remove the property so that a later wait doesn't see this value again.
    delete window._reportTextData;
    return report;
  };

  await page.waitForFunction(isReportAvailable);
  return page.evaluate(takeReport);
}
// Integration tests: each test dispatches a `requestTextContent` event in
// every loaded page and checks the data reported back through
// `window._reportTextData`.
describe("PdfTextExtractor", () => {
  describe("Simple multi-page document", () => {
    let pages;

    beforeEach(async () => {
      pages = await loadAndWait("basicapi.pdf", ".textLayer .endOfContent");
    });

    afterEach(async () => {
      await closePages(pages);
    });

    it("check that all text is extracted", async () => {
      // The complete expected text, one entry per line.
      const expectedText = [
        "Table Of Content",
        "Chapter 1 .......................................................... 2",
        "Paragraph 1.1 ...................................................... 3",
        "page 1 / 3",
        "Chapter 1",
        "page 2 / 3",
        "Paragraph 1.1",
        "Powered by TCPDF (www.tcpdf.org)",
        "page 3 / 3",
      ].join("\n");

      // Each entry of `pages` is a pair whose second element is the page
      // handle; the first element isn't needed here.
      const checkPage = async ([, page]) => {
        await dispatchRequestTextContent(page, 1);
        const report = await getReportTextData(page);

        expect(report.text).toEqual(expectedText);
        expect(report.requestId).toEqual(1);
      };

      // All pages are checked concurrently.
      await Promise.all(pages.map(checkPage));
    });
  });

  describe("Multi-page document, with disableAutoFetch=true set", () => {
    let pages;

    beforeEach(async () => {
      const options = {
        disableAutoFetch: true,
        disableStream: true,
      };
      pages = await loadAndWait(
        "tracemonkey.pdf",
        ".textLayer .endOfContent",
        null,
        null,
        options
      );
    });

    afterEach(async () => {
      await closePages(pages);
    });

    it("check that all text is extracted", async () => {
      // Only the beginning, the end and the total length of the text are
      // checked for this document, rather than the full contents.
      const expectedStart =
        "Trace-based Just-in-Time Type Specialization for Dynamic\nLanguages";
      // NOTE(review): the page range reads "8393" here; confirm against the
      // extracted text that no character between "83" and "93" is intended.
      const expectedEnd =
        "Conference on Virtual Execution Environments, pages 8393. ACM\nPress, 2007.";
      const expectedLength = 82804;

      const checkPage = async ([, page]) => {
        await dispatchRequestTextContent(page, 2);
        const report = await getReportTextData(page);

        expect(report.text.startsWith(expectedStart)).toBeTrue();
        expect(report.text.endsWith(expectedEnd)).toBeTrue();
        expect(report.text.length).toEqual(expectedLength);
        expect(report.requestId).toEqual(2);
      };

      // All pages are checked concurrently.
      await Promise.all(pages.map(checkPage));
    });
  });
});