mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-04-10 15:24:03 +02:00
Improve and simplify the PdfTextExtractor implementation
Working on PR 20784, I couldn't help noticing that this code can be improved a little bit.
- Only initialize `PdfTextExtractor` in development mode and MOZCENTRAL builds, since it's unused elsewhere.
- Refactor how `PdfTextExtractor` waits for the viewer to be available/ready, by using existing (internal) events. This simplifies the `PdfTextExtractor` class and removes its `setViewer` method, which improves general consistency, since viewer components normally don't use such a method in that way (here it was effectively used as a stand-in for a `setDocument` method).
- Finally, while slightly unrelated, rename the `#getAllTextInProgress` field in the `PDFViewer` class to `#copyAllInProgress` to clearly indicate what it's for, since the `getAllText` method is used more generally now.
This commit is contained in:
parent
ae507c49b3
commit
c70ff5a7c0
15
web/app.js
15
web/app.js
@ -264,8 +264,6 @@ const PDFViewerApplication = {
|
||||
}
|
||||
await this._initializeViewerComponents();
|
||||
|
||||
this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
|
||||
|
||||
// Bind the various event handlers *after* the viewer has been
|
||||
// initialized, to prevent errors if an event arrives too soon.
|
||||
this.bindEvents();
|
||||
@ -779,6 +777,17 @@ const PDFViewerApplication = {
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
if (
|
||||
typeof PDFJSDev === "undefined" ||
|
||||
PDFJSDev.test("TESTING || MOZCENTRAL")
|
||||
) {
|
||||
this.pdfTextExtractor = new PdfTextExtractor(
|
||||
externalServices,
|
||||
pdfViewer,
|
||||
eventBus
|
||||
);
|
||||
}
|
||||
},
|
||||
|
||||
async run(config) {
|
||||
@ -1151,7 +1160,6 @@ const PDFViewerApplication = {
|
||||
this.pdfViewer.setDocument(null);
|
||||
this.pdfLinkService.setDocument(null);
|
||||
this.pdfDocumentProperties?.setDocument(null);
|
||||
this.pdfTextExtractor?.setViewer(null);
|
||||
}
|
||||
this.pdfLinkService.externalLinkEnabled = true;
|
||||
this.store = null;
|
||||
@ -1455,7 +1463,6 @@ const PDFViewerApplication = {
|
||||
|
||||
const pdfViewer = this.pdfViewer;
|
||||
pdfViewer.setDocument(pdfDocument);
|
||||
this.pdfTextExtractor.setViewer(pdfViewer);
|
||||
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
|
||||
|
||||
this.pdfThumbnailViewer?.setDocument(pdfDocument);
|
||||
|
||||
@ -18,55 +18,42 @@
|
||||
* and passing it back to the external service.
|
||||
*/
|
||||
class PdfTextExtractor {
|
||||
/** @type {PDFViewer} */
|
||||
#pdfViewer;
|
||||
|
||||
/** @type {BaseExternalServices} */
|
||||
#externalServices;
|
||||
|
||||
/**
|
||||
* @type {?Promise<string>}
|
||||
*/
|
||||
/** @type {?Promise<string>} */
|
||||
#textPromise;
|
||||
|
||||
#pendingRequests = new Set();
|
||||
#capability = Promise.withResolvers();
|
||||
|
||||
constructor(externalServices) {
|
||||
constructor(externalServices, pdfViewer, eventBus) {
|
||||
this.#externalServices = externalServices;
|
||||
|
||||
eventBus._on("pagesinit", () => {
|
||||
this.#capability.resolve(pdfViewer);
|
||||
});
|
||||
eventBus._on("pagesdestroy", () => {
|
||||
this.#capability.reject(new Error("pagesdestroy"));
|
||||
this.#textPromise = null;
|
||||
|
||||
this.#capability = Promise.withResolvers();
|
||||
});
|
||||
|
||||
window.addEventListener("requestTextContent", ({ detail }) => {
|
||||
this.extractTextContent(detail.requestId);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* The PDF viewer is required to get the page text.
|
||||
*
|
||||
* @param {PDFViewer | null}
|
||||
*/
|
||||
setViewer(pdfViewer) {
|
||||
this.#pdfViewer = pdfViewer;
|
||||
if (this.#pdfViewer && this.#pendingRequests.size) {
|
||||
// Handle any pending requests that came in while things were loading.
|
||||
for (const pendingRequest of this.#pendingRequests) {
|
||||
this.extractTextContent(pendingRequest);
|
||||
}
|
||||
this.#pendingRequests.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds up all of the text from a PDF.
|
||||
*
|
||||
* @param {number} requestId
|
||||
*/
|
||||
async extractTextContent(requestId) {
|
||||
if (!this.#pdfViewer) {
|
||||
this.#pendingRequests.add(requestId);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this.#textPromise) {
|
||||
const textPromise = (this.#textPromise = this.#pdfViewer.getAllText());
|
||||
const textPromise = (this.#textPromise = this.#capability.promise.then(
|
||||
pdfViewer => pdfViewer.getAllText()
|
||||
));
|
||||
|
||||
// After the text resolves, cache the text for a little bit in case
|
||||
// multiple consumers call it.
|
||||
|
||||
@ -268,7 +268,7 @@ class PDFViewer {
|
||||
|
||||
#switchAnnotationEditorModeTimeoutId = null;
|
||||
|
||||
#getAllTextInProgress = false;
|
||||
#copyAllInProgress = false;
|
||||
|
||||
#hiddenCopyElement = null;
|
||||
|
||||
@ -816,13 +816,13 @@ class PDFViewer {
|
||||
// has been selected.
|
||||
|
||||
if (
|
||||
this.#getAllTextInProgress ||
|
||||
this.#copyAllInProgress ||
|
||||
textLayerMode === TextLayerMode.ENABLE_PERMISSIONS
|
||||
) {
|
||||
stopEvent(event);
|
||||
return;
|
||||
}
|
||||
this.#getAllTextInProgress = true;
|
||||
this.#copyAllInProgress = true;
|
||||
|
||||
// TODO: if all the pages are rendered we don't need to wait for
|
||||
// getAllText and we could just get text from the Selection object.
|
||||
@ -855,7 +855,7 @@ class PDFViewer {
|
||||
);
|
||||
})
|
||||
.finally(() => {
|
||||
this.#getAllTextInProgress = false;
|
||||
this.#copyAllInProgress = false;
|
||||
keydownAC.abort();
|
||||
classList.remove("copyAll");
|
||||
});
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user