mirror of
https://github.com/mozilla/pdf.js.git
synced 2026-06-26 01:55:48 +02:00
Improve and simplify the PdfTextExtractor implementation
Working on PR 20784, I couldn't help noticing that this code can be improved a little bit.

- Only initialize `PdfTextExtractor` in development mode and in TESTING/MOZCENTRAL builds, since it's unused elsewhere.
- Re-factor how `PdfTextExtractor` waits for the viewer to be available/ready, by using existing (internal) events. This simplifies the `PdfTextExtractor` class and removes its `setViewer` method, which improves general consistency since normally the viewer-components don't use such a method in that way (here it was effectively used as a stand-in for a `setDocument` method).
- Finally, while slightly unrelated, rename the `#getAllTextInProgress` field in the `PDFViewer` class to `#copyAllInProgress` to clearly indicate what it's for, since the `getAllText` method is used more generally now.
This commit is contained in:
parent
ae507c49b3
commit
c70ff5a7c0
15
web/app.js
15
web/app.js
@ -264,8 +264,6 @@ const PDFViewerApplication = {
|
|||||||
}
|
}
|
||||||
await this._initializeViewerComponents();
|
await this._initializeViewerComponents();
|
||||||
|
|
||||||
this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
|
|
||||||
|
|
||||||
// Bind the various event handlers *after* the viewer has been
|
// Bind the various event handlers *after* the viewer has been
|
||||||
// initialized, to prevent errors if an event arrives too soon.
|
// initialized, to prevent errors if an event arrives too soon.
|
||||||
this.bindEvents();
|
this.bindEvents();
|
||||||
@ -779,6 +777,17 @@ const PDFViewerApplication = {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
typeof PDFJSDev === "undefined" ||
|
||||||
|
PDFJSDev.test("TESTING || MOZCENTRAL")
|
||||||
|
) {
|
||||||
|
this.pdfTextExtractor = new PdfTextExtractor(
|
||||||
|
externalServices,
|
||||||
|
pdfViewer,
|
||||||
|
eventBus
|
||||||
|
);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
async run(config) {
|
async run(config) {
|
||||||
@ -1151,7 +1160,6 @@ const PDFViewerApplication = {
|
|||||||
this.pdfViewer.setDocument(null);
|
this.pdfViewer.setDocument(null);
|
||||||
this.pdfLinkService.setDocument(null);
|
this.pdfLinkService.setDocument(null);
|
||||||
this.pdfDocumentProperties?.setDocument(null);
|
this.pdfDocumentProperties?.setDocument(null);
|
||||||
this.pdfTextExtractor?.setViewer(null);
|
|
||||||
}
|
}
|
||||||
this.pdfLinkService.externalLinkEnabled = true;
|
this.pdfLinkService.externalLinkEnabled = true;
|
||||||
this.store = null;
|
this.store = null;
|
||||||
@ -1455,7 +1463,6 @@ const PDFViewerApplication = {
|
|||||||
|
|
||||||
const pdfViewer = this.pdfViewer;
|
const pdfViewer = this.pdfViewer;
|
||||||
pdfViewer.setDocument(pdfDocument);
|
pdfViewer.setDocument(pdfDocument);
|
||||||
this.pdfTextExtractor.setViewer(pdfViewer);
|
|
||||||
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
|
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
|
||||||
|
|
||||||
this.pdfThumbnailViewer?.setDocument(pdfDocument);
|
this.pdfThumbnailViewer?.setDocument(pdfDocument);
|
||||||
|
|||||||
@ -18,55 +18,42 @@
|
|||||||
* and passing it back to the external service.
|
* and passing it back to the external service.
|
||||||
*/
|
*/
|
||||||
class PdfTextExtractor {
|
class PdfTextExtractor {
|
||||||
/** @type {PDFViewer} */
|
/** @type {BaseExternalServices} */
|
||||||
#pdfViewer;
|
|
||||||
|
|
||||||
#externalServices;
|
#externalServices;
|
||||||
|
|
||||||
/**
|
/** @type {?Promise<string>} */
|
||||||
* @type {?Promise<string>}
|
|
||||||
*/
|
|
||||||
#textPromise;
|
#textPromise;
|
||||||
|
|
||||||
#pendingRequests = new Set();
|
#capability = Promise.withResolvers();
|
||||||
|
|
||||||
constructor(externalServices) {
|
constructor(externalServices, pdfViewer, eventBus) {
|
||||||
this.#externalServices = externalServices;
|
this.#externalServices = externalServices;
|
||||||
|
|
||||||
|
eventBus._on("pagesinit", () => {
|
||||||
|
this.#capability.resolve(pdfViewer);
|
||||||
|
});
|
||||||
|
eventBus._on("pagesdestroy", () => {
|
||||||
|
this.#capability.reject(new Error("pagesdestroy"));
|
||||||
|
this.#textPromise = null;
|
||||||
|
|
||||||
|
this.#capability = Promise.withResolvers();
|
||||||
|
});
|
||||||
|
|
||||||
window.addEventListener("requestTextContent", ({ detail }) => {
|
window.addEventListener("requestTextContent", ({ detail }) => {
|
||||||
this.extractTextContent(detail.requestId);
|
this.extractTextContent(detail.requestId);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* The PDF viewer is required to get the page text.
|
|
||||||
*
|
|
||||||
* @param {PDFViewer | null}
|
|
||||||
*/
|
|
||||||
setViewer(pdfViewer) {
|
|
||||||
this.#pdfViewer = pdfViewer;
|
|
||||||
if (this.#pdfViewer && this.#pendingRequests.size) {
|
|
||||||
// Handle any pending requests that came in while things were loading.
|
|
||||||
for (const pendingRequest of this.#pendingRequests) {
|
|
||||||
this.extractTextContent(pendingRequest);
|
|
||||||
}
|
|
||||||
this.#pendingRequests.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds up all of the text from a PDF.
|
* Builds up all of the text from a PDF.
|
||||||
*
|
*
|
||||||
* @param {number} requestId
|
* @param {number} requestId
|
||||||
*/
|
*/
|
||||||
async extractTextContent(requestId) {
|
async extractTextContent(requestId) {
|
||||||
if (!this.#pdfViewer) {
|
|
||||||
this.#pendingRequests.add(requestId);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!this.#textPromise) {
|
if (!this.#textPromise) {
|
||||||
const textPromise = (this.#textPromise = this.#pdfViewer.getAllText());
|
const textPromise = (this.#textPromise = this.#capability.promise.then(
|
||||||
|
pdfViewer => pdfViewer.getAllText()
|
||||||
|
));
|
||||||
|
|
||||||
// After the text resolves, cache the text for a little bit in case
|
// After the text resolves, cache the text for a little bit in case
|
||||||
// multiple consumers call it.
|
// multiple consumers call it.
|
||||||
|
|||||||
@ -268,7 +268,7 @@ class PDFViewer {
|
|||||||
|
|
||||||
#switchAnnotationEditorModeTimeoutId = null;
|
#switchAnnotationEditorModeTimeoutId = null;
|
||||||
|
|
||||||
#getAllTextInProgress = false;
|
#copyAllInProgress = false;
|
||||||
|
|
||||||
#hiddenCopyElement = null;
|
#hiddenCopyElement = null;
|
||||||
|
|
||||||
@ -816,13 +816,13 @@ class PDFViewer {
|
|||||||
// has been selected.
|
// has been selected.
|
||||||
|
|
||||||
if (
|
if (
|
||||||
this.#getAllTextInProgress ||
|
this.#copyAllInProgress ||
|
||||||
textLayerMode === TextLayerMode.ENABLE_PERMISSIONS
|
textLayerMode === TextLayerMode.ENABLE_PERMISSIONS
|
||||||
) {
|
) {
|
||||||
stopEvent(event);
|
stopEvent(event);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
this.#getAllTextInProgress = true;
|
this.#copyAllInProgress = true;
|
||||||
|
|
||||||
// TODO: if all the pages are rendered we don't need to wait for
|
// TODO: if all the pages are rendered we don't need to wait for
|
||||||
// getAllText and we could just get text from the Selection object.
|
// getAllText and we could just get text from the Selection object.
|
||||||
@ -855,7 +855,7 @@ class PDFViewer {
|
|||||||
);
|
);
|
||||||
})
|
})
|
||||||
.finally(() => {
|
.finally(() => {
|
||||||
this.#getAllTextInProgress = false;
|
this.#copyAllInProgress = false;
|
||||||
keydownAC.abort();
|
keydownAC.abort();
|
||||||
classList.remove("copyAll");
|
classList.remove("copyAll");
|
||||||
});
|
});
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user