Improve and simplify the PdfTextExtractor implementation

While working on PR 20784, I noticed that this code could be improved a little bit.

 - Only initialize `PdfTextExtractor` in development mode and in TESTING/MOZCENTRAL builds, since it's unused elsewhere.

 - Refactor how `PdfTextExtractor` waits for the viewer to be available/ready, by using existing (internal) events.
   This simplifies the `PdfTextExtractor` class and removes its `setViewer` method. Removing it improves general consistency, since viewer components normally don't use such a method in that way (here it was effectively used as a stand-in for a `setDocument` method).

 - Finally, while slightly unrelated, rename the `#getAllTextInProgress` field in the `PDFViewer` class to `#copyAllInProgress`, to clearly indicate what it's for, since the `getAllText` method is now used more generally.
This commit is contained in:
Jonas Jenwald 2026-03-03 14:27:12 +01:00
parent ae507c49b3
commit c70ff5a7c0
3 changed files with 32 additions and 38 deletions

View File

@ -264,8 +264,6 @@ const PDFViewerApplication = {
}
await this._initializeViewerComponents();
this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
// Bind the various event handlers *after* the viewer has been
// initialized, to prevent errors if an event arrives too soon.
this.bindEvents();
@ -779,6 +777,17 @@ const PDFViewerApplication = {
);
};
}
if (
typeof PDFJSDev === "undefined" ||
PDFJSDev.test("TESTING || MOZCENTRAL")
) {
this.pdfTextExtractor = new PdfTextExtractor(
externalServices,
pdfViewer,
eventBus
);
}
},
async run(config) {
@ -1151,7 +1160,6 @@ const PDFViewerApplication = {
this.pdfViewer.setDocument(null);
this.pdfLinkService.setDocument(null);
this.pdfDocumentProperties?.setDocument(null);
this.pdfTextExtractor?.setViewer(null);
}
this.pdfLinkService.externalLinkEnabled = true;
this.store = null;
@ -1455,7 +1463,6 @@ const PDFViewerApplication = {
const pdfViewer = this.pdfViewer;
pdfViewer.setDocument(pdfDocument);
this.pdfTextExtractor.setViewer(pdfViewer);
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
this.pdfThumbnailViewer?.setDocument(pdfDocument);

View File

@ -18,55 +18,42 @@
* and passing it back to the external service.
*/
class PdfTextExtractor {
/** @type {PDFViewer} */
#pdfViewer;
/** @type {BaseExternalServices} */
#externalServices;
/**
* @type {?Promise<string>}
*/
/** @type {?Promise<string>} */
#textPromise;
#pendingRequests = new Set();
#capability = Promise.withResolvers();
constructor(externalServices) {
constructor(externalServices, pdfViewer, eventBus) {
this.#externalServices = externalServices;
eventBus._on("pagesinit", () => {
this.#capability.resolve(pdfViewer);
});
eventBus._on("pagesdestroy", () => {
this.#capability.reject(new Error("pagesdestroy"));
this.#textPromise = null;
this.#capability = Promise.withResolvers();
});
window.addEventListener("requestTextContent", ({ detail }) => {
this.extractTextContent(detail.requestId);
});
}
/**
* The PDF viewer is required to get the page text.
*
* @param {PDFViewer | null}
*/
setViewer(pdfViewer) {
this.#pdfViewer = pdfViewer;
if (this.#pdfViewer && this.#pendingRequests.size) {
// Handle any pending requests that came in while things were loading.
for (const pendingRequest of this.#pendingRequests) {
this.extractTextContent(pendingRequest);
}
this.#pendingRequests.clear();
}
}
/**
* Builds up all of the text from a PDF.
*
* @param {number} requestId
*/
async extractTextContent(requestId) {
if (!this.#pdfViewer) {
this.#pendingRequests.add(requestId);
return;
}
if (!this.#textPromise) {
const textPromise = (this.#textPromise = this.#pdfViewer.getAllText());
const textPromise = (this.#textPromise = this.#capability.promise.then(
pdfViewer => pdfViewer.getAllText()
));
// After the text resolves, cache the text for a little bit in case
// multiple consumers call it.

View File

@ -268,7 +268,7 @@ class PDFViewer {
#switchAnnotationEditorModeTimeoutId = null;
#getAllTextInProgress = false;
#copyAllInProgress = false;
#hiddenCopyElement = null;
@ -816,13 +816,13 @@ class PDFViewer {
// has been selected.
if (
this.#getAllTextInProgress ||
this.#copyAllInProgress ||
textLayerMode === TextLayerMode.ENABLE_PERMISSIONS
) {
stopEvent(event);
return;
}
this.#getAllTextInProgress = true;
this.#copyAllInProgress = true;
// TODO: if all the pages are rendered we don't need to wait for
// getAllText and we could just get text from the Selection object.
@ -855,7 +855,7 @@ class PDFViewer {
);
})
.finally(() => {
this.#getAllTextInProgress = false;
this.#copyAllInProgress = false;
keydownAC.abort();
classList.remove("copyAll");
});