From 22871eef2310831164fc1984b5eda29ffac51e27 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 22 Jun 2026 14:32:19 +0200 Subject: [PATCH] Improve the `sink` handling in `getTextContent` for Highlight annotations (PR 20019 follow-up) Currently there's a couple of issues related to the `sink` handling: - The `Page.prototype.extractTextContent` method is invoked with options that it doesn't actually use; note https://github.com/mozilla/pdf.js/blob/1ddf6449ac4e5c8157249dd0cd584112104fdd60/src/core/document.js#L669-L676 - When parsing "nested" textContent, i.e. /Form /XObjects, we end up wrongly treating repeated /XObjects as empty for the annotations use-case since `enqueue` is never invoked; note https://github.com/mozilla/pdf.js/blob/1ddf6449ac4e5c8157249dd0cd584112104fdd60/src/core/evaluator.js#L3439 and https://github.com/mozilla/pdf.js/blob/1ddf6449ac4e5c8157249dd0cd584112104fdd60/src/core/evaluator.js#L3449-L3451 - The `getTextContent` method might become ever so slightly slower by having to defer parsing at every step, given the "bad" fallback value when comparing with the `TEXT_CONTENT_CHUNK_SIZE` constant (in the API), note https://github.com/mozilla/pdf.js/blob/1ddf6449ac4e5c8157249dd0cd584112104fdd60/src/display/api.js#L1705 and https://github.com/mozilla/pdf.js/blob/1ddf6449ac4e5c8157249dd0cd584112104fdd60/src/core/evaluator.js#L3566 - Having the `sink` now be effectively optional, in the `getTextContent` method, does complicate the code slightly overall. To address these things this patch ensures that a `sink` will always be available, by re-using the `sinkWrapper` structure from the "nested" textContent case, and with reasonable default values. --- src/core/document.js | 2 -- src/core/evaluator.js | 28 +++++++--------------------- src/core/evaluator_utils.js | 24 +++++++++++++++++++++++- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index 7ab48a6bd..a2eaa0737 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -789,8 +789,6 @@ class Page { includeMarkedContent: false, disableNormalization: false, sink: null, - viewBox: this.view, - lang: null, intersector, }).then(() => { intersector.setText(); diff --git a/src/core/evaluator.js b/src/core/evaluator.js index be6031937..97736b6c0 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -78,6 +78,7 @@ import { LocalTilingPatternCache, RegionalImageCache, } from "./image_utils.js"; +import { parseMarkedContentProps, textSinkWrapper } from "./evaluator_utils.js"; import { BaseStream } from "./base_stream.js"; import { bidi } from "./bidi.js"; import { ColorSpace } from "./colorspace.js"; @@ -87,7 +88,6 @@ import { getGlyphsUnicode } from "./glyphlist.js"; import { getMetrics } from "./metrics.js"; import { getUnicodeForGlyph } from "./unicode.js"; import { MurmurHash3_64 } from "../shared/murmurhash3.js"; -import { parseMarkedContentProps } from "./evaluator_utils.js"; import { PDFImage } from "./image.js"; import { Stream } from "./stream.js"; import { stringToPDFString } from "./string_utils.js"; @@ -2380,6 +2380,7 @@ class PartialEvaluator { stream = new Stream(bytes, 0, bytes.length, stream.dict); } } + sink ??= textSinkWrapper(null); const objId = stream.dict?.objId; const seenRefs = new RefSet(prevRefs); @@ -3153,7 +3154,7 @@ class PartialEvaluator { if (batch && length < TEXT_CHUNK_BATCH_SIZE) { return; } - sink?.enqueue(textContent, length); + sink.enqueue(textContent, length); textContent.items = []; textContent.styles = Object.create(null); } @@ -3163,7 +3164,7 @@ class PartialEvaluator { return new Promise(function promiseBody(resolve, reject) { const next = function (promise) { enqueueChunk(/* batch = */ true); - Promise.all([promise, sink?.ready]).then(function () { + Promise.all([promise, sink.ready]).then(function () { try { promiseBody(resolve, reject); } catch (ex) { @@ -3409,22 +3410,7 @@ class PartialEvaluator { // Enqueue the `textContent` chunk before parsing the /Form // XObject. enqueueChunk(); - const sinkWrapper = { - enqueueInvoked: false, - - enqueue(chunk, size) { - this.enqueueInvoked = true; - sink.enqueue(chunk, size); - }, - - get desiredSize() { - return sink.desiredSize ?? 0; - }, - - get ready() { - return sink.ready; - }, - }; + const sinkWrapper = textSinkWrapper(sink); self .getTextContent({ @@ -3436,7 +3422,7 @@ class PartialEvaluator { : resources, stateManager: xObjStateManager, includeMarkedContent, - sink: sink && sinkWrapper, + sink: sinkWrapper, seenStyles, viewBox, lang, @@ -3563,7 +3549,7 @@ class PartialEvaluator { } break; } // switch - if (textContent.items.length >= (sink?.desiredSize ?? 1)) { + if (textContent.items.length >= sink.desiredSize) { // Wait for ready, if we reach highWaterMark. stop = true; break; diff --git a/src/core/evaluator_utils.js b/src/core/evaluator_utils.js index b871bfa93..b2d4b6fe6 100644 --- a/src/core/evaluator_utils.js +++ b/src/core/evaluator_utils.js @@ -16,6 +16,28 @@ import { Dict, Name, Ref } from "./primitives.js"; import { FormatError, warn } from "../shared/util.js"; +function textSinkWrapper(sink) { + const TEXT_CONTENT_CHUNK_SIZE = 100; // Same as in `src/display/api.js`. + const resolved = sink ? null : Promise.resolve(); + + return { + enqueueInvoked: false, + + enqueue(chunk, size) { + this.enqueueInvoked = true; + sink?.enqueue(chunk, size); + }, + + get desiredSize() { + return sink?.desiredSize ?? TEXT_CONTENT_CHUNK_SIZE; + }, + + get ready() { + return sink?.ready ?? resolved; + }, + }; +} + function _parseVisibilityExpression( xref, array, @@ -120,4 +142,4 @@ function parseMarkedContentProps(xref, contentProperties, resources) { return null; } -export { parseMarkedContentProps }; +export { parseMarkedContentProps, textSinkWrapper };