// Mirror of https://github.com/mozilla/pdf.js.git
// Synced 2026-05-31 15:21:00 +02:00 · 120 lines · 3.4 KiB · JavaScript
/* Copyright 2026 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
||
|
||
import { closePages, loadAndWait } from "./test_utils.mjs";
|
||
|
||
/**
 * Dispatch a `requestTextContent` custom event on `window`, by way of
 * `page.evaluate`.
 *
 * The event bubbles, is cancelable, and carries `{ requestId }` as its
 * `detail`, where `requestId` is the `id` forwarded through `page.evaluate`.
 *
 * @param {Object} page - Object exposing `evaluate(fn, ...args)`; presumably
 *   a Puppeteer page handed out by the test utilities (confirm against
 *   `test_utils.mjs`).
 * @param {*} id - Identifier placed in the event's `detail.requestId`.
 * @returns {Promise} Whatever `page.evaluate` returns; the evaluated callback
 *   itself has no return value.
 */
async function dispatchRequestTextContent(page, id) {
  return page.evaluate(requestId => {
    const event = new CustomEvent("requestTextContent", {
      bubbles: true,
      cancelable: true,
      detail: { requestId },
    });
    window.dispatchEvent(event);
  }, id);
}
|
||
|
||
/**
 * Wait until `window._reportTextData` is defined, then read it and remove it.
 *
 * The property is deleted after being read, so that a later call waits for a
 * fresh value rather than returning the previous one.
 *
 * @param {Object} page - Object exposing `waitForFunction(fn)` and
 *   `evaluate(fn)`; presumably a Puppeteer page (confirm against
 *   `test_utils.mjs`).
 * @returns {Promise} Resolves with the value that was stored in
 *   `window._reportTextData`. The tests in this file destructure `text` and
 *   `requestId` from it.
 */
async function getReportTextData(page) {
  await page.waitForFunction(() => window._reportTextData !== undefined);
  return page.evaluate(() => {
    const data = window._reportTextData;
    delete window._reportTextData;
    return data;
  });
}
|
||
|
||
// Integration tests for text extraction: each test dispatches a
// "requestTextContent" event and then checks the data that shows up in
// `window._reportTextData`.
describe("PdfTextExtractor", () => {
  describe("Simple multi-page document", () => {
    let pages;

    beforeEach(async () => {
      pages = await loadAndWait("basicapi.pdf", ".textLayer .endOfContent");
    });

    afterEach(async () => {
      await closePages(pages);
    });

    it("check that all text is extracted", async () => {
      await Promise.all(
        pages.map(async ([browserName, page]) => {
          await dispatchRequestTextContent(page, 1);

          const { text, requestId } = await getReportTextData(page);

          // The expected lines are joined with "\n" and compared against the
          // whole extracted text.
          expect(text).toEqual(
            [
              "Table Of Content",
              "Chapter 1 .......................................................... 2",
              "Paragraph 1.1 ...................................................... 3",
              "page 1 / 3",
              "Chapter 1",
              "page 2 / 3",
              "Paragraph 1.1",
              "Powered by TCPDF (www.tcpdf.org)",
              "page 3 / 3",
            ].join("\n")
          );
          // The reported id must match the one that was dispatched.
          expect(requestId).toEqual(1);
        })
      );
    });
  });

  describe("Multi-page document, with disableAutoFetch=true set", () => {
    let pages;

    beforeEach(async () => {
      // The trailing object sets `disableAutoFetch` and `disableStream`;
      // presumably `loadAndWait` forwards it as viewer options (confirm
      // against `test_utils.mjs`).
      pages = await loadAndWait(
        "tracemonkey.pdf",
        ".textLayer .endOfContent",
        null,
        null,
        {
          disableAutoFetch: true,
          disableStream: true,
        }
      );
    });

    afterEach(async () => {
      await closePages(pages);
    });

    it("check that all text is extracted", async () => {
      await Promise.all(
        pages.map(async ([browserName, page]) => {
          await dispatchRequestTextContent(page, 2);

          const { text, requestId } = await getReportTextData(page);

          // Only the start, the end and the total length of the extracted
          // text are checked here, not the full contents.
          expect(
            text.startsWith(
              "Trace-based Just-in-Time Type Specialization for Dynamic\nLanguages"
            )
          ).toBeTrue();
          expect(
            text.endsWith(
              "Conference on Virtual Execution Environments, pages 83–93. ACM\nPress, 2007."
            )
          ).toBeTrue();
          expect(text.length).toEqual(82804);
          // The reported id must match the one that was dispatched.
          expect(requestId).toEqual(2);
        })
      );
    });
  });
});
|